ORFEOME_Seq Pipeline¶
InĀ [1]:
# Improved version of Bowtie2 running from Kim's lab
# 1. Import all the libsraries
# 2. setting directories
# 3. Parse FASTA file
# 4. running bowtie2 in parallel
# 5. Merge read counts to one csv file
# 6. Making summary of log files
# 7. Getting read count from SAM files
# 8. Merging all read count into one xlsx file
InĀ [2]:
# Setting environment
# 1. Download Anaconda
# 2. Download Jupyter, Python, Pylance etc in Virtual Studio Code extension
# 3. Create a new conda environement in the terminal (conda create -n pipeline)
# 4. Download openpyxl and r-base
# 5. Go to setting and link r-base to least R 4.2.0
# 6. Download all the necessary python packages
# 7. Download all the necessary R packages
Table of Contents¶
- Normalization
- Installing R packages (optional)
- Import R packages
- Gene count summary
- Read count normalization (EdgeR, Bioinfokit)
- Noise detection
- Noise removal from dataframe
- Correlation matrix clustering (Pre-normalization)
- Box & Violin Plot (Pre-normalization)
- Quantile Normalization
- Box & Violin Plot (Post-Quantile Normalization)
- Batch correction (Limma)
- Correlation matrix clustering (Post-normalization)
Code initialization ¶
Import all the libraries ¶
InĀ [3]:
#Python default packages
import os
import pwd
import glob
import subprocess
import csv
import random
from concurrent.futures import ProcessPoolExecutor
import concurrent.futures
import multiprocessing
from multiprocessing import Pool, cpu_count
import threading
import signal
import sys
import tempfile
import warnings
from itertools import product
from itertools import combinations
import warnings
import gzip
import shutil
import tarfile
import time
from collections import OrderedDict
InĀ [4]:
from numba.core.errors import NumbaDeprecationWarning
# Suppress specific Numba deprecation warnings
warnings.filterwarnings('ignore', category=NumbaDeprecationWarning)
InĀ [5]:
#Python additional packages
import multiqc
import pysam
from tqdm import tqdm
from bioinfokit.analys import norm
from IPython.display import display, HTML
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from matplotlib import gridspec
from matplotlib.patches import Patch
import matplotlib.ticker as mticker
from matplotlib.ticker import MaxNLocator, LogFormatter
from matplotlib_venn import venn3, venn2
from venn import venn
import pandas as pd
import numpy as np
from Bio import SeqIO
from kmodes.kmodes import KModes
from scipy.stats import rankdata
from scipy.stats import chi2, t, norm
from scipy.stats import pearsonr, spearmanr, kendalltau, multiscale_graphcorr
from statsmodels.distributions.empirical_distribution import ECDF
from scipy.interpolate import interp1d
from scipy.cluster import hierarchy
from scipy.spatial.distance import pdist
from scipy.stats import gamma, norm, poisson
from scipy.optimize import brentq
from scipy.optimize import minimize
from scipy.spatial import distance_matrix
from scipy.sparse.csgraph import minimum_spanning_tree
from scipy.sparse import coo_matrix
from scipy.cluster.hierarchy import dendrogram, linkage
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px
import webcolors
from webcolors import hex_to_rgb
import umap
import hdbscan
%matplotlib inline
# %matplotlib agg
# %matplotlib agg
InĀ [6]:
# Suppress FutureWarnings
warnings.simplefilter(action='ignore', category=FutureWarning)
# Suppress DeprecationWarnings
warnings.simplefilter(action='ignore', category=DeprecationWarning)
InĀ [7]:
plt.rcParams['axes.labelsize'] = 35 # For X and Y labels
plt.rcParams['xtick.labelsize'] = 30# For X-axis tick labels
plt.rcParams['ytick.labelsize'] = 30 # For Y-axis tick labels
plt.rcParams['legend.fontsize'] = 30 # For the legend
plt.rcParams['axes.titlesize'] = 30 # For the plot title
plt.rcParams['axes.linewidth'] = 1.5 # For the axes lines
plt.rcParams['font.size'] = 20 # For all text in the plot
plt.rcParams['axes.labelpad'] = 20
plt.rcParams['svg.fonttype'] = 'none'
InĀ [984]:
plt.rcParams['svg.fonttype'] = 'none'
InĀ [8]:
# # DAVID related packages
# import logging
# import traceback as tb
# import suds.metrics as metrics
# from tests import *
# from suds import *
# from suds.client import Client
# from datetime import datetime
InĀ [9]:
#Python package for R
import rpy2.robjects as robjects
from rpy2.robjects.packages import importr
from rpy2.robjects.vectors import StrVector, FloatVector
from rpy2.robjects import pandas2ri
from rpy2.robjects import FactorVector
from rpy2.robjects.vectors import DataFrame
pandas2ri.activate()
import rpy2.robjects.numpy2ri as numpy2ri
from rpy2.robjects.conversion import localconverter
Setting directories ¶
InĀ [10]:
# Define the base directory variable
directory = "RQ023682"
# Define User ID
uid = os.getuid()
user_info = pwd.getpwuid(uid)
user_id = user_info.pw_name
# Define the base directory
base_dir = f"~/rnaseq_analysis/{directory}"
# Expand the tilde (~) in the base directory path
base_dir = os.path.expanduser(base_dir)
# Get the list of RS-* folders
folders = glob.glob(os.path.join(base_dir, "RS-*"))
# Create the output .Cut directory
output_cut_dir = os.path.join(base_dir, f"{directory}_cut")
os.makedirs(output_cut_dir, exist_ok=True)
# Create the output .Trim directory
output_trim_dir = os.path.join(base_dir, f"{directory}_trim")
os.makedirs(output_trim_dir, exist_ok=True)
# Create the output .sam directory
output_sam_dir = os.path.join(base_dir, f"{directory}_sam")
os.makedirs(output_sam_dir, exist_ok=True)
# Create the output .bam directory
output_bam_dir = os.path.join(base_dir, f"{directory}_bam")
os.makedirs(output_bam_dir, exist_ok=True)
# Create the graph storing directory
graphs_files = os.path.join(base_dir, directory+ "_graphs")
os.makedirs(graphs_files, exist_ok=True)
# Create the graph storing for the original
graphs_files_original = os.path.join(graphs_files, "Original_Graph" )
os.makedirs(graphs_files_original, exist_ok=True)
# Create the graph storing for the statistical testing
graphs_files_stats = os.path.join(graphs_files, "Statistical_Graph" )
os.makedirs(graphs_files_stats, exist_ok=True)
# Create the database storing directory
database_files = os.path.join(base_dir, directory+ "_db")
os.makedirs(database_files, exist_ok=True)
# Create the database storing directory
database_files_original = os.path.join(database_files, "Original_db")
os.makedirs(database_files_original, exist_ok=True)
# Create the database storing directory
database_files_stats = os.path.join(database_files, "Statistical_db")
os.makedirs(database_files_stats, exist_ok=True)
# Setting working directory
PATH = base_dir
os.chdir(PATH)
Index reference genome and iterate sample names ¶
InĀ [11]:
# Extract the reference directory from the index path
reference_dir = f"/home/{user_id}/rnaseq_analysis/Reference/9.1_delta/"
os.chdir(reference_dir)
ref_name = "9'1_delta_final_LP_BxB.fa"
ref_dir = os.path.join(reference_dir, ref_name)
# Split the file name by '.' and take the first part as the index base name
index_base_name = ref_name.split('.')[0]
# Check if the Bowtie2 index files already exist
index_files_exist = all(os.path.exists(f"{index_base_name}.{ext}") for ext in ["1.bt2", "2.bt2", "3.bt2", "4.bt2", "rev.1.bt2", "rev.2.bt2"])
if not index_files_exist:
# Command to run bowtie2-build with the reference_dir as input and index_base_name as output
cmd = ["bowtie2-build", ref_dir, index_base_name]
try:
# Run bowtie2-build using subprocess
subprocess.run(cmd, check=True)
print("bowtie2-build completed successfully.")
except subprocess.CalledProcessError as e:
print(f"Error occurred while running bowtie2-build: {e}")
else:
print("Bowtie2 index files already exist. Skipping the indexing process.")
reference_index_dir = os.path.join(reference_dir, index_base_name)
# Create an empty dictionary
sample_key = {}
# Iterate sample names
for folder in folders:
r1_names = glob.glob(os.path.join(folder, "*R1_001.fastq.gz"))
r2_names = glob.glob(os.path.join(folder, "*R2_001.fastq.gz"))
if r1_names and r2_names and len(r1_names) == len(r2_names):
for r1_name, r2_name in zip(r1_names, r2_names):
r1 = os.path.basename(r1_name).split("_")[1]
r2 = os.path.basename(r2_name).split("_")[1]
if r1 == r2:
folder_name = os.path.basename(folder)
if folder_name not in sample_key:
sample_key[folder_name] = []
sample_key[folder_name].append(r1)
sample_key = dict(sorted(sample_key.items()))
Bowtie2 index files already exist. Skipping the indexing process.
Chemotherapeutic categories and colors ¶
InĀ [12]:
name_list = ['DMSO',
'Paclitaxel', 'Cisplatin', 'TFT', 'FdU', 'EdU',
'Doxorubicin', '5FU', 'Carboplatin', 'Bleomycin', 'Etoposide',
'MitomycinC', 'Carmustine', 'Irinotecan', '6mercaptopurine',
'Vinblastine', 'TAS102']
InĀ [13]:
# Final category
drug_category = {
"Control": ["DMSO", "Baseline", "mCherryPositive&BFPNegative", "mCherryNegative&BFPNegative", "Serumfree"],
"Antimetabolite": ["TFT", "TAS102", "FdU", "EdU", "5FU", "6mercaptopurine"],
"DNA cross linking agent": ["Cisplatin", "Carboplatin", "Carmustine", "MitomycinC"],
"DNA strand break agent": ["Doxorubicin", "Etoposide", "Irinotecan", "Bleomycin"],
"Microtubule inhibitor": ["Paclitaxel","Vinblastine"],
}
# Final category color code
category_colors = {
"Control": "#E3E3DD", # Pantone 9101 C Color
"Antimetabolite": "#B03A2E", # Close to Medium Carmine
"DNA cross linking agent": "#28B463", # Medium Sea Green
"DNA strand break agent": "#7C3B97", # Cadmium Violet
"Microtubule inhibitor": "#2F86C1", # Boyzone
}
# Final chemotherapy color code
drug_color_map = {
# Control
"DMSO": "#EAEAE6", # White Sail (DMSO)
"Baseline": "#F1F1EF", # Bleached Silk (Baseline)
"mCherryPositive&BFPNegative" : "#F5F5F4", # Ivory (mCherry+/BFP-)
"mCherryNegative&BFPNegative" : "#F5F5F4", # Ivory (mCherry-/BFP-)
"Serumfree" : "#F9F9F8", # Light Ivory (Serum Free)
# Antimetabolite
"TFT": "#EC7063", # Terra Cotta (5-FU)
"TAS102": "#FADBD8", # Light grayish red (TAS102)
"FdU": "#FDEDEC", # Provincial Pink (FdU)
"EdU": "#F5B7B1", # Beauty Bush (TFT)
"5FU": "#F5AFA8", # Sundown (5FU)
"6mercaptopurine": "#F1948A", # Sweet Pink (6-mercaptopurine)
# Cross Linking Drug
"Cisplatin": "#82E0AA", # Pearl Aqua (Carboplatin)
"Carboplatin": "#ABEBC6", # Magic Mint (Cisplatin)
"Carmustine" : "#D5F5E3", # Aero Blue (Mitomycin C)
"MitomycinC": "#EAFAF1", # Pale green (Carmustine)
# DNA strand-break agent
"Doxorubicin": "#BB8FCE", # Amethyst Show (Bleomycin)
"Bleomycin" : "#EBDEF0", # White Lilac (Doxorubicin)
"Etoposide": "#F5EEF8", # AliceBlue color (Etoposide)
"Irinotecan": "#D7BDE2", # Pretty Petunia (Irinotecan)
# Microtubule inhibitor
"Paclitaxel": "#93C8EC", # Fail Whale (Paclitaxel)
"Vinblastine": "#D6EAF8", # cyan-blue (Vinblastine)
}
Assign color map to excel file ¶
InĀ [14]:
# Helper function to get color name from RGB
def get_color_name_from_hex(hex_code):
try:
return webcolors.hex_to_name(hex_code, spec='css3')
except ValueError:
return "Unknown"
# Helper function to convert RGB to CMYK
def rgb_to_cmyk(rgb):
r, g, b = rgb
c = 1 - r / 255
m = 1 - g / 255
y = 1 - b / 255
k = min(c, m, y)
if k == 1:
c = m = y = 0
else:
c = round((c - k) / (1 - k) * 100)
m = round((m - k) / (1 - k) * 100)
y = round((y - k) / (1 - k) * 100)
k = round(k * 100)
return c, m, y, k
# Create the DataFrame
data = []
for drug, hex_code in drug_color_map.items():
rgb = hex_to_rgb(hex_code)
color_name = get_color_name_from_hex(hex_code)
cmyk = rgb_to_cmyk(rgb)
category = None
category_color = None
for key, value in drug_category.items():
if drug in (value if isinstance(value, list) else [value]):
category = key
category_color = category_colors[key]
break
data.append([drug, category, category_color, color_name, rgb, hex_code, cmyk])
columns = ["Drug", "Category", "Category_Color", "Color_Name", "RGB", "Drug_Hex_Code", "CMYK"]
color_df = pd.DataFrame(data, columns=columns)
# Save the DataFrame
color_df_path = os.path.join(base_dir, f"{directory}_color.xlsx")
color_df.to_excel(color_df_path)
Parse FASTA files ¶
InĀ [15]:
fasta_rows = []
for record in SeqIO.parse(ref_dir, "fasta"):
# Get the gene name from the record description
gene_name = record.description.split()[0]
# Split the gene name by "_"
gene_parts = gene_name.split("_")
# Extract the additional columns
ORFeome_ID = gene_parts[0]
NCBI_no = gene_parts[1]
group_no = gene_parts[2]
gene_ID = "_".join(gene_parts[3:])
# Count the number of bases in the sequence
base_count = len(record.seq)
# Calculate GC content
gc_content = (record.seq.count("G") + record.seq.count("C")) / base_count * 100
# Create a row for the CSV file
fasta_row = [gene_name, ORFeome_ID, NCBI_no, group_no, gene_ID, base_count, gc_content]
# Add the row to the list of CSV rows
fasta_rows.append(fasta_row)
# Path to the output CSV file
fasta_file = os.path.join(base_dir, "fasta_index.csv")
# Write the gene names, base counts, and GC content to the CSV file
with open(fasta_file, "w", newline="") as csvfile:
writer = csv.writer(csvfile)
writer.writerow(["ID", "ORF_ID", "NCBI", "Group", "Gene_Symbol", "Length", "GC_Content"])
writer.writerows(fasta_rows)
print(f"CSV file saved to {fasta_file}")
CSV file saved to /home/harryjo/rnaseq_analysis/RQ023682/fasta_index.csv
Pre-processing ¶
FASTQ ā Pre QC ¶
InĀ [16]:
# Create a list to store the FastQC subprocess instances
prefastqc_processes = []
# Specify the folder for FastQC output
folder_fastqc_dir = os.path.join(base_dir, f"{directory}_Pre_FASTQC")
# Create the FastQC output directory if it doesn't exist
os.makedirs(folder_fastqc_dir, exist_ok=True)
for folder in folders:
r1_files = glob.glob(os.path.join(folder, "*R1_001.fastq.gz"))[0]
r2_files = glob.glob(os.path.join(folder, "*R2_001.fastq.gz"))[0]
if not r1_files or not r2_files:
print(f"Skipping {os.path.basename(folder)} - R1 or R2 files not found.")
continue
folder_name = os.path.basename(folder)
# Create a subfolder for the current sample's FastQC output
sample_fastqc_dir = os.path.join(folder_fastqc_dir, folder_name)
os.makedirs(sample_fastqc_dir, exist_ok=True)
r1_files_name = os.path.basename(r1_files).split('.')[0]
r2_files_name = os.path.basename(r2_files).split('.')[0]
r1_exist = os.path.join(sample_fastqc_dir, f"{os.path.basename(r1_files_name)}_fastqc.html")
r2_exist = os.path.join(sample_fastqc_dir, f"{os.path.basename(r1_files_name)}_fastqc.html")
# Check if FastQC output already exists for R1 file
if os.path.exists(r1_exist):
print(f"Skipping {folder_name} - FastQC output already exists for R1")
else:
# Run FastQC for R1 file
command_r1 = [
"fastqc",
"-f", "fastq",
"--extract",
"-t", "9",
"-o", sample_fastqc_dir,
r1_files
]
process_r1 = subprocess.Popen(command_r1, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
prefastqc_processes.append(process_r1)
# Check if FastQC output already exists for R2 file
if os.path.exists(r2_exist):
print(f"Skipping {folder_name} - FastQC output already exists for R2")
else:
# Run FastQC for R2 file
command_r2 = [
"fastqc",
"-f", "fastq",
"--extract",
"-t", "9",
"-o", sample_fastqc_dir,
r2_files
]
process_r2 = subprocess.Popen(command_r2, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
prefastqc_processes.append(process_r2)
# Wait for all subprocesses to finish and re-run ones with errors
for process in prefastqc_processes:
_, stderr_output = process.communicate()
if process.returncode != 0:
print(f"Error message: {stderr_output.decode('utf-8')}")
# Re-run the same FastQC command
re_run_process = subprocess.Popen(
process.args, # Re-run the same command
stdout=subprocess.PIPE, stderr=subprocess.PIPE
)
_, re_run_stderr = re_run_process.communicate()
if re_run_process.returncode != 0:
print("Failed to re-run FastQC.")
print("FastQC for all R1 and R2 files completed.")
Skipping RS-03984980 - FastQC output already exists for R1 Skipping RS-03984980 - FastQC output already exists for R2 Skipping RS-03985010 - FastQC output already exists for R1 Skipping RS-03985010 - FastQC output already exists for R2 Skipping RS-03985029 - FastQC output already exists for R1 Skipping RS-03985029 - FastQC output already exists for R2 Skipping RS-03985030 - FastQC output already exists for R1 Skipping RS-03985030 - FastQC output already exists for R2 Skipping RS-03985031 - FastQC output already exists for R1 Skipping RS-03985031 - FastQC output already exists for R2 Skipping RS-03985032 - FastQC output already exists for R1 Skipping RS-03985032 - FastQC output already exists for R2 Skipping RS-03985033 - FastQC output already exists for R1 Skipping RS-03985033 - FastQC output already exists for R2 Skipping RS-03985034 - FastQC output already exists for R1 Skipping RS-03985034 - FastQC output already exists for R2 Skipping RS-03985035 - FastQC output already exists for R1 Skipping RS-03985035 - FastQC output already exists for R2 Skipping RS-03985036 - FastQC output already exists for R1 Skipping RS-03985036 - FastQC output already exists for R2 Skipping RS-03985037 - FastQC output already exists for R1 Skipping RS-03985037 - FastQC output already exists for R2 Skipping RS-03985038 - FastQC output already exists for R1 Skipping RS-03985038 - FastQC output already exists for R2 Skipping RS-03985039 - FastQC output already exists for R1 Skipping RS-03985039 - FastQC output already exists for R2 Skipping RS-03985040 - FastQC output already exists for R1 Skipping RS-03985040 - FastQC output already exists for R2 Skipping RS-03985041 - FastQC output already exists for R1 Skipping RS-03985041 - FastQC output already exists for R2 Skipping RS-04068630 - FastQC output already exists for R1 Skipping RS-04068630 - FastQC output already exists for R2 Skipping RS-04068631 - FastQC output already exists for R1 Skipping RS-04068631 - FastQC output already exists for R2 Skipping RS-04068632 - FastQC output already exists for R1 Skipping RS-04068632 - FastQC output already exists for R2 Skipping RS-04068633 - FastQC output already exists for R1 Skipping RS-04068633 - FastQC output already exists for R2 Skipping RS-04068634 - FastQC output already exists for R1 Skipping RS-04068634 - FastQC output already exists for R2 Skipping RS-04068635 - FastQC output already exists for R1 Skipping RS-04068635 - FastQC output already exists for R2 Skipping RS-04068636 - FastQC output already exists for R1 Skipping RS-04068636 - FastQC output already exists for R2 Skipping RS-03984981 - FastQC output already exists for R1 Skipping RS-03984981 - FastQC output already exists for R2 Skipping RS-03984982 - FastQC output already exists for R1 Skipping RS-03984982 - FastQC output already exists for R2 Skipping RS-03984983 - FastQC output already exists for R1 Skipping RS-03984983 - FastQC output already exists for R2 Skipping RS-03984984 - FastQC output already exists for R1 Skipping RS-03984984 - FastQC output already exists for R2 Skipping RS-03984985 - FastQC output already exists for R1 Skipping RS-03984985 - FastQC output already exists for R2 Skipping RS-03984986 - FastQC output already exists for R1 Skipping RS-03984986 - FastQC output already exists for R2 Skipping RS-03984987 - FastQC output already exists for R1 Skipping RS-03984987 - FastQC output already exists for R2 Skipping RS-03984988 - FastQC output already exists for R1 Skipping RS-03984988 - FastQC output already exists for R2 Skipping RS-03984989 - FastQC output already exists for R1 Skipping RS-03984989 - FastQC output already exists for R2 Skipping RS-03984990 - FastQC output already exists for R1 Skipping RS-03984990 - FastQC output already exists for R2 Skipping RS-03984991 - FastQC output already exists for R1 Skipping RS-03984991 - FastQC output already exists for R2 Skipping RS-03984992 - FastQC output already exists for R1 Skipping RS-03984992 - FastQC output already exists for R2 Skipping RS-03984993 - FastQC output already exists for R1 Skipping RS-03984993 - FastQC output already exists for R2 Skipping RS-03984995 - FastQC output already exists for R1 Skipping RS-03984995 - FastQC output already exists for R2 Skipping RS-03984996 - FastQC output already exists for R1 Skipping RS-03984996 - FastQC output already exists for R2 Skipping RS-03984997 - FastQC output already exists for R1 Skipping RS-03984997 - FastQC output already exists for R2 Skipping RS-03984998 - FastQC output already exists for R1 Skipping RS-03984998 - FastQC output already exists for R2 Skipping RS-03984999 - FastQC output already exists for R1 Skipping RS-03984999 - FastQC output already exists for R2 Skipping RS-03985000 - FastQC output already exists for R1 Skipping RS-03985000 - FastQC output already exists for R2 Skipping RS-03985001 - FastQC output already exists for R1 Skipping RS-03985001 - FastQC output already exists for R2 Skipping RS-03985002 - FastQC output already exists for R1 Skipping RS-03985002 - FastQC output already exists for R2 Skipping RS-03985003 - FastQC output already exists for R1 Skipping RS-03985003 - FastQC output already exists for R2 Skipping RS-03985004 - FastQC output already exists for R1 Skipping RS-03985004 - FastQC output already exists for R2 Skipping RS-03985005 - FastQC output already exists for R1 Skipping RS-03985005 - FastQC output already exists for R2 Skipping RS-03985006 - FastQC output already exists for R1 Skipping RS-03985006 - FastQC output already exists for R2 Skipping RS-03985007 - FastQC output already exists for R1 Skipping RS-03985007 - FastQC output already exists for R2 Skipping RS-03985008 - FastQC output already exists for R1 Skipping RS-03985008 - FastQC output already exists for R2 Skipping RS-03985009 - FastQC output already exists for R1 Skipping RS-03985009 - FastQC output already exists for R2 Skipping RS-03984975 - FastQC output already exists for R1 Skipping RS-03984975 - FastQC output already exists for R2 Skipping RS-03984976 - FastQC output already exists for R1 Skipping RS-03984976 - FastQC output already exists for R2 Skipping RS-03984977 - FastQC output already exists for R1 Skipping RS-03984977 - FastQC output already exists for R2 Skipping RS-03984978 - FastQC output already exists for R1 Skipping RS-03984978 - FastQC output already exists for R2 Skipping RS-03984979 - FastQC output already exists for R1 Skipping RS-03984979 - FastQC output already exists for R2 Skipping RS-03985011 - FastQC output already exists for R1 Skipping RS-03985011 - FastQC output already exists for R2 Skipping RS-03985012 - FastQC output already exists for R1 Skipping RS-03985012 - FastQC output already exists for R2 Skipping RS-03985013 - FastQC output already exists for R1 Skipping RS-03985013 - FastQC output already exists for R2 Skipping RS-03985014 - FastQC output already exists for R1 Skipping RS-03985014 - FastQC output already exists for R2 Skipping RS-03985015 - FastQC output already exists for R1 Skipping RS-03985015 - FastQC output already exists for R2 Skipping RS-03985016 - FastQC output already exists for R1 Skipping RS-03985016 - FastQC output already exists for R2 Skipping RS-03985017 - FastQC output already exists for R1 Skipping RS-03985017 - FastQC output already exists for R2 Skipping RS-03985018 - FastQC output already exists for R1 Skipping RS-03985018 - FastQC output already exists for R2 Skipping RS-03985019 - FastQC output already exists for R1 Skipping RS-03985019 - FastQC output already exists for R2 Skipping RS-03985020 - FastQC output already exists for R1 Skipping RS-03985020 - FastQC output already exists for R2 Skipping RS-03985021 - FastQC output already exists for R1 Skipping RS-03985021 - FastQC output already exists for R2 Skipping RS-03985022 - FastQC output already exists for R1 Skipping RS-03985022 - FastQC output already exists for R2 Skipping RS-03985023 - FastQC output already exists for R1 Skipping RS-03985023 - FastQC output already exists for R2 Skipping RS-03985024 - FastQC output already exists for R1 Skipping RS-03985024 - FastQC output already exists for R2 Skipping RS-03985025 - FastQC output already exists for R1 Skipping RS-03985025 - FastQC output already exists for R2 Skipping RS-03985026 - FastQC output already exists for R1 Skipping RS-03985026 - FastQC output already exists for R2 Skipping RS-03985027 - FastQC output already exists for R1 Skipping RS-03985027 - FastQC output already exists for R2 Skipping RS-03985028 - FastQC output already exists for R1 Skipping RS-03985028 - FastQC output already exists for R2 FastQC for all R1 and R2 files completed.
Pre-QC MultiQC analysis ¶
InĀ [17]:
# Specify the folder and file name for the MultiQC report
multiqc_output_dir = os.path.join(base_dir, f"{directory}_Pre-QC_MultiQC")
os.makedirs(multiqc_output_dir, exist_ok=True)
multiqc_report_file = os.path.join(multiqc_output_dir, f"{directory}_Pre-QC_multiqc_report.html")
# Generate the MultiQC report with --interactive flag
if os.path.exists(multiqc_report_file):
print("Skipping MultiQC - Report file already exists.")
else:
# Generate the MultiQC report with --interactive flag
multiqc_command = [
"multiqc",
"--interactive",
folder_fastqc_dir,
"-o", multiqc_output_dir,
"--filename", f"{directory}_Pre-QC_multiqc_report.html"
]
subprocess.run(multiqc_command, check=True)
print("MultiQC report generation completed.")
# # Display the MultiQC report in the notebook
# display(HTML(filename=os.path.join(PATH, "multiqc_report.html")))
Skipping MultiQC - Report file already exists. MultiQC report generation completed.
Adapter Trimming (Cutadapt) ¶
InĀ [18]:
# Define adapter sequences for R1 and R2
Truseq_adapter_sequence_R1 = "AGATCGGAAGAGCACACGTCTGAACTCCAGTCA"
Truseq_adapter_sequence_R2 = "AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT"
# PCR1_primers_R1 = "CACCCACACAAAGGAAAAGGG"
# PCR1_primers_R2 = "CACATTGCCAAAAGACGGCA"
# PCR2_primers_R1 = "NNNNNCTCACAAGTTTGTACAAAAAAG"
# PCR2_primers_R2 = "NNNNNTGACCACTTTGTACAAGAAAG"
# List to store Cutadapt subprocess instances
cutadapt_processes = []
# Iterate over the RS-* folders
for folder in folders:
r1_files = glob.glob(os.path.join(folder, "*R1_001.fastq.gz"))[0]
r2_files = glob.glob(os.path.join(folder, "*R2_001.fastq.gz"))[0]
if not r1_files or not r2_files:
print(f"Skipping {os.path.basename(folder)} - R1 or R2 files not found.")
continue
folder_name = os.path.basename(folder)
# Create a subfolder for the current sample's FastQC output
sample_cut_dir = os.path.join(output_cut_dir, folder_name)
os.makedirs(sample_cut_dir, exist_ok=True)
# File names without extensions
r1_files_name = os.path.basename(r1_files).split('.')[0]
r2_files_name = os.path.basename(r2_files).split('.')[0]
# Output file paths in the output_cut_dir
r1_output_file = os.path.join(sample_cut_dir, f"{r1_files_name}.trimmed.fastq.gz")
r2_output_file = os.path.join(sample_cut_dir, f"{r2_files_name}.trimmed.fastq.gz")
if os.path.exists(r1_output_file) and os.path.exists(r2_output_file):
print(f"Skipping {folder_name} - Cutadapt output already exists for R1 and R2")
else:
# Create a log directory inside the output_cut_dir
log_directory = os.path.join(output_cut_dir, "cutadapt_logs")
os.makedirs(log_directory, exist_ok=True)
# Define log file path for stdout
stdout_log_path = os.path.join(log_directory, f"{folder_name}_cutadapt_stdout.log")
# Run Cutadapt for R1 and R2 files
command = [
"cutadapt",
"-a", Truseq_adapter_sequence_R1,
"-A", Truseq_adapter_sequence_R2,
"-o", r1_output_file,
"-p", r2_output_file,
r1_files,
r2_files
]
# Open log file for stdout
stdout_log_file = open(stdout_log_path, "w")
# Run the process and store it in the list
process = subprocess.Popen(command, stdout=stdout_log_file, stderr=subprocess.PIPE)
cutadapt_processes.append(process)
# Wait for all Cutadapt subprocesses to finish
for process in cutadapt_processes:
_, stderr_output = process.communicate()
if process.returncode != 0:
print(f"Error message: {stderr_output.decode('utf-8')}")
print("Cutadapt for all R1 and R2 files completed.")
Skipping RS-03984980 - Cutadapt output already exists for R1 and R2 Skipping RS-03985010 - Cutadapt output already exists for R1 and R2 Skipping RS-03985029 - Cutadapt output already exists for R1 and R2 Skipping RS-03985030 - Cutadapt output already exists for R1 and R2 Skipping RS-03985031 - Cutadapt output already exists for R1 and R2 Skipping RS-03985032 - Cutadapt output already exists for R1 and R2 Skipping RS-03985033 - Cutadapt output already exists for R1 and R2 Skipping RS-03985034 - Cutadapt output already exists for R1 and R2 Skipping RS-03985035 - Cutadapt output already exists for R1 and R2 Skipping RS-03985036 - Cutadapt output already exists for R1 and R2 Skipping RS-03985037 - Cutadapt output already exists for R1 and R2 Skipping RS-03985038 - Cutadapt output already exists for R1 and R2 Skipping RS-03985039 - Cutadapt output already exists for R1 and R2 Skipping RS-03985040 - Cutadapt output already exists for R1 and R2 Skipping RS-03985041 - Cutadapt output already exists for R1 and R2 Skipping RS-04068630 - Cutadapt output already exists for R1 and R2 Skipping RS-04068631 - Cutadapt output already exists for R1 and R2 Skipping RS-04068632 - Cutadapt output already exists for R1 and R2 Skipping RS-04068633 - Cutadapt output already exists for R1 and R2 Skipping RS-04068634 - Cutadapt output already exists for R1 and R2 Skipping RS-04068635 - Cutadapt output already exists for R1 and R2 Skipping RS-04068636 - Cutadapt output already exists for R1 and R2 Skipping RS-03984981 - Cutadapt output already exists for R1 and R2 Skipping RS-03984982 - Cutadapt output already exists for R1 and R2 Skipping RS-03984983 - Cutadapt output already exists for R1 and R2 Skipping RS-03984984 - Cutadapt output already exists for R1 and R2 Skipping RS-03984985 - Cutadapt output already exists for R1 and R2 Skipping RS-03984986 - Cutadapt output already exists for R1 and R2 Skipping RS-03984987 - Cutadapt output already exists for R1 and R2 Skipping RS-03984988 - Cutadapt output already exists for R1 and R2 Skipping RS-03984989 - Cutadapt output already exists for R1 and R2 Skipping RS-03984990 - Cutadapt output already exists for R1 and R2 Skipping RS-03984991 - Cutadapt output already exists for R1 and R2 Skipping RS-03984992 - Cutadapt output already exists for R1 and R2 Skipping RS-03984993 - Cutadapt output already exists for R1 and R2 Skipping RS-03984995 - Cutadapt output already exists for R1 and R2 Skipping RS-03984996 - Cutadapt output already exists for R1 and R2 Skipping RS-03984997 - Cutadapt output already exists for R1 and R2 Skipping RS-03984998 - Cutadapt output already exists for R1 and R2 Skipping RS-03984999 - Cutadapt output already exists for R1 and R2 Skipping RS-03985000 - Cutadapt output already exists for R1 and R2 Skipping RS-03985001 - Cutadapt output already exists for R1 and R2 Skipping RS-03985002 - Cutadapt output already exists for R1 and R2 Skipping RS-03985003 - Cutadapt output already exists for R1 and R2 Skipping RS-03985004 - Cutadapt output already exists for R1 and R2 Skipping RS-03985005 - Cutadapt output already exists for R1 and R2 Skipping RS-03985006 - Cutadapt output already exists for R1 and R2 Skipping RS-03985007 - Cutadapt output already exists for R1 and R2 Skipping RS-03985008 - Cutadapt output already exists for R1 and R2 Skipping RS-03985009 - Cutadapt output already exists for R1 and R2 Skipping RS-03984975 - Cutadapt output already exists for R1 and R2 Skipping RS-03984976 - Cutadapt output already exists for R1 and R2 Skipping RS-03984977 - Cutadapt output already exists for R1 and R2 Skipping RS-03984978 - Cutadapt output already exists for R1 and R2 Skipping RS-03984979 - Cutadapt output already exists for R1 and R2 Skipping RS-03985011 - Cutadapt output already exists for R1 and R2 Skipping RS-03985012 - Cutadapt output already exists for R1 and R2 Skipping RS-03985013 - Cutadapt output already exists for R1 and R2 Skipping RS-03985014 - Cutadapt output already exists for R1 and R2 Skipping RS-03985015 - Cutadapt output already exists for R1 and R2 Skipping RS-03985016 - Cutadapt output already exists for R1 and R2 Skipping RS-03985017 - Cutadapt output already exists for R1 and R2 Skipping RS-03985018 - Cutadapt output already exists for R1 and R2 Skipping RS-03985019 - Cutadapt output already exists for R1 and R2 Skipping RS-03985020 - Cutadapt output already exists for R1 and R2 Skipping RS-03985021 - Cutadapt output already exists for R1 and R2 Skipping RS-03985022 - Cutadapt output already exists for R1 and R2 Skipping RS-03985023 - Cutadapt output already exists for R1 and R2 Skipping RS-03985024 - Cutadapt output already exists for R1 and R2 Skipping RS-03985025 - Cutadapt output already exists for R1 and R2 Skipping RS-03985026 - Cutadapt output already exists for R1 and R2 Skipping RS-03985027 - Cutadapt output already exists for R1 and R2 Skipping RS-03985028 - Cutadapt output already exists for R1 and R2 Cutadapt for all R1 and R2 files completed.
Quality score trimming (Trimmomatic) ¶
InĀ [19]:
# Create a list to store the Trimmomatic subprocess instances
trimm_processes = []
# Iterate over the RS-* folders
for folder in folders:
folder_name = os.path.basename(folder)
r1_files = glob.glob(os.path.join(output_cut_dir, folder_name, "*R1_001.trimmed.fastq.gz"))[0]
r2_files = glob.glob(os.path.join(output_cut_dir, folder_name, "*R2_001.trimmed.fastq.gz"))[0]
if not r1_files or not r2_files:
print(f"Skipping {os.path.basename(folder)} - R1 or R2 files not found.")
continue
# Create a subfolder for the current sample's FastQC output
sample_trim_dir = os.path.join(output_trim_dir, folder_name)
os.makedirs(sample_trim_dir, exist_ok=True)
# Define input and output file paths
r1_files_name = os.path.basename(r1_files).split('.')[0]
r2_files_name = os.path.basename(r2_files).split('.')[0]
r1_output_paired = os.path.join(sample_trim_dir, f"{r1_files_name}.trim_paired.fastq.gz")
r2_output_paired = os.path.join(sample_trim_dir, f"{r2_files_name}.trim_paired.fastq.gz")
r1_output_unpaired = os.path.join(sample_trim_dir, f"{r1_files_name}.trim_unpaired.fastq.gz")
r2_output_unpaired = os.path.join(sample_trim_dir, f"{r2_files_name}.trim_unpaired.fastq.gz")
# Check if Trimmomatic output already exists for R1 and R2 files
if os.path.exists(r1_output_paired) and os.path.exists(r2_output_paired) and \
os.path.exists(r1_output_unpaired) and os.path.exists(r2_output_unpaired):
print(f"Skipping {folder_name} - Trimmomatic output already exists for R1 and R2")
else:
# Create a log directory inside the output_trim_dir
log_directory = os.path.join(output_trim_dir, "trimmomatic_logs")
os.makedirs(log_directory, exist_ok=True)
# Define log file path for stdout
stdout_log_path = os.path.join(log_directory, f"{folder_name}_stdout.log")
# Run TrimmomaticPE for R1 and R2 files
command = [
"TrimmomaticPE",
"-trimlog", stdout_log_path,
r1_files, r2_files,
r1_output_paired, r1_output_unpaired,
r2_output_paired, r2_output_unpaired,
"LEADING:20",
"TRAILING:20",
"SLIDINGWINDOW:4:15",
"MINLEN:25"
]
process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
trimm_processes.append(process)
# Wait for all subprocesses to finish
for process in trimm_processes:
_, stderr_output = process.communicate()
if process.returncode != 0:
print(f"An error occurred: {stderr_output.decode('utf-8')}")
print("Trimmomatic for all R1 and R2 files completed.")
Skipping RS-03984980 - Trimmomatic output already exists for R1 and R2 Skipping RS-03985010 - Trimmomatic output already exists for R1 and R2 Skipping RS-03985029 - Trimmomatic output already exists for R1 and R2 Skipping RS-03985030 - Trimmomatic output already exists for R1 and R2 Skipping RS-03985031 - Trimmomatic output already exists for R1 and R2 Skipping RS-03985032 - Trimmomatic output already exists for R1 and R2 Skipping RS-03985033 - Trimmomatic output already exists for R1 and R2 Skipping RS-03985034 - Trimmomatic output already exists for R1 and R2 Skipping RS-03985035 - Trimmomatic output already exists for R1 and R2 Skipping RS-03985036 - Trimmomatic output already exists for R1 and R2 Skipping RS-03985037 - Trimmomatic output already exists for R1 and R2 Skipping RS-03985038 - Trimmomatic output already exists for R1 and R2 Skipping RS-03985039 - Trimmomatic output already exists for R1 and R2 Skipping RS-03985040 - Trimmomatic output already exists for R1 and R2 Skipping RS-03985041 - Trimmomatic output already exists for R1 and R2 Skipping RS-04068630 - Trimmomatic output already exists for R1 and R2 Skipping RS-04068631 - Trimmomatic output already exists for R1 and R2 Skipping RS-04068632 - Trimmomatic output already exists for R1 and R2 Skipping RS-04068633 - Trimmomatic output already exists for R1 and R2 Skipping RS-04068634 - Trimmomatic output already exists for R1 and R2 Skipping RS-04068635 - Trimmomatic output already exists for R1 and R2 Skipping RS-04068636 - Trimmomatic output already exists for R1 and R2 Skipping RS-03984981 - Trimmomatic output already exists for R1 and R2 Skipping RS-03984982 - Trimmomatic output already exists for R1 and R2 Skipping RS-03984983 - Trimmomatic output already exists for R1 and R2 Skipping RS-03984984 - Trimmomatic output already exists for R1 and R2 Skipping RS-03984985 - Trimmomatic output already exists for R1 and R2 Skipping RS-03984986 - Trimmomatic output already exists for R1 and R2 Skipping RS-03984987 - Trimmomatic output already exists for R1 and R2 Skipping RS-03984988 - Trimmomatic output already exists for R1 and R2 Skipping RS-03984989 - Trimmomatic output already exists for R1 and R2 Skipping RS-03984990 - Trimmomatic output already exists for R1 and R2 Skipping RS-03984991 - Trimmomatic output already exists for R1 and R2 Skipping RS-03984992 - Trimmomatic output already exists for R1 and R2 Skipping RS-03984993 - Trimmomatic output already exists for R1 and R2 Skipping RS-03984995 - Trimmomatic output already exists for R1 and R2 Skipping RS-03984996 - Trimmomatic output already exists for R1 and R2 Skipping RS-03984997 - Trimmomatic output already exists for R1 and R2 Skipping RS-03984998 - Trimmomatic output already exists for R1 and R2 Skipping RS-03984999 - Trimmomatic output already exists for R1 and R2 Skipping RS-03985000 - Trimmomatic output already exists for R1 and R2 Skipping RS-03985001 - Trimmomatic output already exists for R1 and R2 Skipping RS-03985002 - Trimmomatic output already exists for R1 and R2 Skipping RS-03985003 - Trimmomatic output already exists for R1 and R2 Skipping RS-03985004 - Trimmomatic output already exists for R1 and R2 Skipping RS-03985005 - Trimmomatic output already exists for R1 and R2 Skipping RS-03985006 - Trimmomatic output already exists for R1 and R2 Skipping RS-03985007 - Trimmomatic output already exists for R1 and R2 Skipping RS-03985008 - Trimmomatic output already exists for R1 and R2 Skipping RS-03985009 - Trimmomatic output already exists for R1 and R2 Skipping RS-03984975 - Trimmomatic output already exists for R1 and R2 Skipping RS-03984976 - Trimmomatic output already exists for R1 and R2 Skipping RS-03984977 - Trimmomatic output already exists for R1 and R2 Skipping RS-03984978 - Trimmomatic output already exists for R1 and R2 Skipping RS-03984979 - Trimmomatic output already exists for R1 and R2 Skipping RS-03985011 - Trimmomatic output already exists for R1 and R2 Skipping RS-03985012 - Trimmomatic output already exists for R1 and R2 Skipping RS-03985013 - Trimmomatic output already exists for R1 and R2 Skipping RS-03985014 - Trimmomatic output already exists for R1 and R2 Skipping RS-03985015 - Trimmomatic output already exists for R1 and R2 Skipping RS-03985016 - Trimmomatic output already exists for R1 and R2 Skipping RS-03985017 - Trimmomatic output already exists for R1 and R2 Skipping RS-03985018 - Trimmomatic output already exists for R1 and R2 Skipping RS-03985019 - Trimmomatic output already exists for R1 and R2 Skipping RS-03985020 - Trimmomatic output already exists for R1 and R2 Skipping RS-03985021 - Trimmomatic output already exists for R1 and R2 Skipping RS-03985022 - Trimmomatic output already exists for R1 and R2 Skipping RS-03985023 - Trimmomatic output already exists for R1 and R2 Skipping RS-03985024 - Trimmomatic output already exists for R1 and R2 Skipping RS-03985025 - Trimmomatic output already exists for R1 and R2 Skipping RS-03985026 - Trimmomatic output already exists for R1 and R2 Skipping RS-03985027 - Trimmomatic output already exists for R1 and R2 Skipping RS-03985028 - Trimmomatic output already exists for R1 and R2 Trimmomatic for all R1 and R2 files completed.
FASTQ ā Post QC ¶
InĀ [20]:
# Create a list to store the FastQC subprocess instances
postfastqc_processes = []
# Specify the folder for FastQC output
folder_fastqc_dir = os.path.join(base_dir, f"{directory}_Post_FASTQC")
# Create the FastQC output directory if it doesn't exist
os.makedirs(folder_fastqc_dir, exist_ok=True)
# Iterate over the RS-* folders
for folder in folders:
folder_name = os.path.basename(folder)
r1_files = glob.glob(os.path.join(output_trim_dir, folder_name, "*R1_001.trim_paired.fastq.gz"))[0]
r2_files = glob.glob(os.path.join(output_trim_dir, folder_name, "*R2_001.trim_paired.fastq.gz"))[0]
if not r1_files or not r2_files:
print(f"Skipping {os.path.basename(folder)} - R1 or R2 files not found.")
continue
# Create a subfolder for the current sample's FastQC output
sample_fastqc_dir = os.path.join(folder_fastqc_dir, folder_name)
os.makedirs(sample_fastqc_dir, exist_ok=True)
r1_files_name = os.path.basename(r1_files).split('.')[0]
r2_files_name = os.path.basename(r2_files).split('.')[0]
r1_exist = os.path.join(sample_fastqc_dir, f"{os.path.basename(r1_files_name)}.trim_paired_fastqc.html")
r2_exist = os.path.join(sample_fastqc_dir, f"{os.path.basename(r2_files_name)}.trim_paired_fastqc.html")
# Check if FastQC output already exists for R1 file
if os.path.exists(r1_exist):
print(f"Skipping {folder_name} - FastQC output already exists for R1")
else:
# Run FastQC for R1 file
command_r1 = [
"fastqc",
"-f", "fastq",
"--extract",
"-t", "9",
"-o", sample_fastqc_dir,
r1_files
]
process_r1 = subprocess.Popen(command_r1, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
postfastqc_processes.append(process_r1)
# Check if FastQC output already exists for R2 file
if os.path.exists(r2_exist):
print(f"Skipping {folder_name} - FastQC output already exists for R2")
else:
# Run FastQC for R2 file
command_r2 = [
"fastqc",
"-f", "fastq",
"--extract",
"-t", "9",
"-o", sample_fastqc_dir,
r2_files
]
process_r2 = subprocess.Popen(command_r2, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
postfastqc_processes.append(process_r2)
# Wait for all subprocesses to finish and re-run ones with errors
for process in postfastqc_processes:
_, stderr_output = process.communicate()
if process.returncode != 0:
print(f"Error message: {stderr_output.decode('utf-8')}")
# Re-run the same FastQC command
re_run_process = subprocess.Popen(
process.args, # Re-run the same command
stdout=subprocess.PIPE, stderr=subprocess.PIPE
)
_, re_run_stderr = re_run_process.communicate()
if re_run_process.returncode != 0:
print("Failed to re-run FastQC.")
print("FastQC for all R1 and R2 files completed.")
Skipping RS-03984980 - FastQC output already exists for R1 Skipping RS-03984980 - FastQC output already exists for R2 Skipping RS-03985010 - FastQC output already exists for R1 Skipping RS-03985010 - FastQC output already exists for R2 Skipping RS-03985029 - FastQC output already exists for R1 Skipping RS-03985029 - FastQC output already exists for R2 Skipping RS-03985030 - FastQC output already exists for R1 Skipping RS-03985030 - FastQC output already exists for R2 Skipping RS-03985031 - FastQC output already exists for R1 Skipping RS-03985031 - FastQC output already exists for R2 Skipping RS-03985032 - FastQC output already exists for R1 Skipping RS-03985032 - FastQC output already exists for R2 Skipping RS-03985033 - FastQC output already exists for R1 Skipping RS-03985033 - FastQC output already exists for R2 Skipping RS-03985034 - FastQC output already exists for R1 Skipping RS-03985034 - FastQC output already exists for R2 Skipping RS-03985035 - FastQC output already exists for R1 Skipping RS-03985035 - FastQC output already exists for R2 Skipping RS-03985036 - FastQC output already exists for R1 Skipping RS-03985036 - FastQC output already exists for R2 Skipping RS-03985037 - FastQC output already exists for R1 Skipping RS-03985037 - FastQC output already exists for R2 Skipping RS-03985038 - FastQC output already exists for R1 Skipping RS-03985038 - FastQC output already exists for R2 Skipping RS-03985039 - FastQC output already exists for R1 Skipping RS-03985039 - FastQC output already exists for R2 Skipping RS-03985040 - FastQC output already exists for R1 Skipping RS-03985040 - FastQC output already exists for R2 Skipping RS-03985041 - FastQC output already exists for R1 Skipping RS-03985041 - FastQC output already exists for R2 Skipping RS-04068630 - FastQC output already exists for R1 Skipping RS-04068630 - FastQC output already exists for R2 Skipping RS-04068631 - FastQC output already exists for R1 Skipping RS-04068631 - FastQC output already exists for R2 Skipping RS-04068632 - FastQC output already exists for R1 Skipping RS-04068632 - FastQC output already exists for R2 Skipping RS-04068633 - FastQC output already exists for R1 Skipping RS-04068633 - FastQC output already exists for R2 Skipping RS-04068634 - FastQC output already exists for R1 Skipping RS-04068634 - FastQC output already exists for R2 Skipping RS-04068635 - FastQC output already exists for R1 Skipping RS-04068635 - FastQC output already exists for R2 Skipping RS-04068636 - FastQC output already exists for R1 Skipping RS-04068636 - FastQC output already exists for R2 Skipping RS-03984981 - FastQC output already exists for R1 Skipping RS-03984981 - FastQC output already exists for R2 Skipping RS-03984982 - FastQC output already exists for R1 Skipping RS-03984982 - FastQC output already exists for R2 Skipping RS-03984983 - FastQC output already exists for R1 Skipping RS-03984983 - FastQC output already exists for R2 Skipping RS-03984984 - FastQC output already exists for R1 Skipping RS-03984984 - FastQC output already exists for R2 Skipping RS-03984985 - FastQC output already exists for R1 Skipping RS-03984985 - FastQC output already exists for R2 Skipping RS-03984986 - FastQC output already exists for R1 Skipping RS-03984986 - FastQC output already exists for R2 Skipping RS-03984987 - FastQC output already exists for R1 Skipping RS-03984987 - FastQC output already exists for R2 Skipping RS-03984988 - FastQC output already exists for R1 Skipping RS-03984988 - FastQC output already exists for R2 Skipping RS-03984989 - FastQC output already exists for R1 Skipping RS-03984989 - FastQC output already exists for R2 Skipping RS-03984990 - FastQC output already exists for R1 Skipping RS-03984990 - FastQC output already exists for R2 Skipping RS-03984991 - FastQC output already exists for R1 Skipping RS-03984991 - FastQC output already exists for R2 Skipping RS-03984992 - FastQC output already exists for R1 Skipping RS-03984992 - FastQC output already exists for R2 Skipping RS-03984993 - FastQC output already exists for R1 Skipping RS-03984993 - FastQC output already exists for R2 Skipping RS-03984995 - FastQC output already exists for R1 Skipping RS-03984995 - FastQC output already exists for R2 Skipping RS-03984996 - FastQC output already exists for R1 Skipping RS-03984996 - FastQC output already exists for R2 Skipping RS-03984997 - FastQC output already exists for R1 Skipping RS-03984997 - FastQC output already exists for R2 Skipping RS-03984998 - FastQC output already exists for R1 Skipping RS-03984998 - FastQC output already exists for R2 Skipping RS-03984999 - FastQC output already exists for R1 Skipping RS-03984999 - FastQC output already exists for R2 Skipping RS-03985000 - FastQC output already exists for R1 Skipping RS-03985000 - FastQC output already exists for R2 Skipping RS-03985001 - FastQC output already exists for R1 Skipping RS-03985001 - FastQC output already exists for R2 Skipping RS-03985002 - FastQC output already exists for R1 Skipping RS-03985002 - FastQC output already exists for R2 Skipping RS-03985003 - FastQC output already exists for R1 Skipping RS-03985003 - FastQC output already exists for R2 Skipping RS-03985004 - FastQC output already exists for R1 Skipping RS-03985004 - FastQC output already exists for R2 Skipping RS-03985005 - FastQC output already exists for R1 Skipping RS-03985005 - FastQC output already exists for R2 Skipping RS-03985006 - FastQC output already exists for R1 Skipping RS-03985006 - FastQC output already exists for R2 Skipping RS-03985007 - FastQC output already exists for R1 Skipping RS-03985007 - FastQC output already exists for R2 Skipping RS-03985008 - FastQC output already exists for R1 Skipping RS-03985008 - FastQC output already exists for R2 Skipping RS-03985009 - FastQC output already exists for R1 Skipping RS-03985009 - FastQC output already exists for R2 Skipping RS-03984975 - FastQC output already exists for R1 Skipping RS-03984975 - FastQC output already exists for R2 Skipping RS-03984976 - FastQC output already exists for R1 Skipping RS-03984976 - FastQC output already exists for R2 Skipping RS-03984977 - FastQC output already exists for R1 Skipping RS-03984977 - FastQC output already exists for R2 Skipping RS-03984978 - FastQC output already exists for R1 Skipping RS-03984978 - FastQC output already exists for R2 Skipping RS-03984979 - FastQC output already exists for R1 Skipping RS-03984979 - FastQC output already exists for R2 Skipping RS-03985011 - FastQC output already exists for R1 Skipping RS-03985011 - FastQC output already exists for R2 Skipping RS-03985012 - FastQC output already exists for R1 Skipping RS-03985012 - FastQC output already exists for R2 Skipping RS-03985013 - FastQC output already exists for R1 Skipping RS-03985013 - FastQC output already exists for R2 Skipping RS-03985014 - FastQC output already exists for R1 Skipping RS-03985014 - FastQC output already exists for R2 Skipping RS-03985015 - FastQC output already exists for R1 Skipping RS-03985015 - FastQC output already exists for R2 Skipping RS-03985016 - FastQC output already exists for R1 Skipping RS-03985016 - FastQC output already exists for R2 Skipping RS-03985017 - FastQC output already exists for R1 Skipping RS-03985017 - FastQC output already exists for R2 Skipping RS-03985018 - FastQC output already exists for R1 Skipping RS-03985018 - FastQC output already exists for R2 Skipping RS-03985019 - FastQC output already exists for R1 Skipping RS-03985019 - FastQC output already exists for R2 Skipping RS-03985020 - FastQC output already exists for R1 Skipping RS-03985020 - FastQC output already exists for R2 Skipping RS-03985021 - FastQC output already exists for R1 Skipping RS-03985021 - FastQC output already exists for R2 Skipping RS-03985022 - FastQC output already exists for R1 Skipping RS-03985022 - FastQC output already exists for R2 Skipping RS-03985023 - FastQC output already exists for R1 Skipping RS-03985023 - FastQC output already exists for R2 Skipping RS-03985024 - FastQC output already exists for R1 Skipping RS-03985024 - FastQC output already exists for R2 Skipping RS-03985025 - FastQC output already exists for R1 Skipping RS-03985025 - FastQC output already exists for R2 Skipping RS-03985026 - FastQC output already exists for R1 Skipping RS-03985026 - FastQC output already exists for R2 Skipping RS-03985027 - FastQC output already exists for R1 Skipping RS-03985027 - FastQC output already exists for R2 Skipping RS-03985028 - FastQC output already exists for R1 Skipping RS-03985028 - FastQC output already exists for R2 FastQC for all R1 and R2 files completed.
Alignment ¶
Parallel alignment (Bowtie2) ¶
InĀ [21]:
# Create a list to store the subprocess instances
processes = []
# Iterate over the RS-* folders
for folder in folders:
folder_name = os.path.basename(folder)
r1_files = glob.glob(os.path.join(output_trim_dir, folder_name, "*R1_001.trim_paired.fastq.gz"))[0]
r2_files = glob.glob(os.path.join(output_trim_dir, folder_name, "*R2_001.trim_paired.fastq.gz"))[0]
folder_name = os.path.basename(folder)
output_sam = os.path.join(output_sam_dir, folder_name + ".sam")
output_bam = os.path.join(output_bam_dir, folder_name + ".bam")
sam_compress = os.path.join(base_dir, output_sam_dir, f"{directory}_sam_compressed.tar.gz")
bam_compress = os.path.join(base_dir, output_bam_dir, f"{directory}_bam_compressed.tar.gz")
# Check if SAM file already exists in the output directory
if os.path.exists(output_sam) or os.path.exists(output_bam) or os.path.exists(sam_compress) or os.path.exists(bam_compress):
print(f"Skipping {folder_name} - file already exists")
continue
command = [
"bowtie2",
"-x", reference_index_dir,
"-1", r1_files,
"-2", r2_files,
"-p", "9",
"-S", output_sam,
"--local",
"--very-sensitive-local",
"-I", "10",
"-X", "700",
"--dovetail",
"--no-unal",
"--no-mixed"
]
# Create a log file for subprocess output
log_file = os.path.join(output_sam_dir, f"{folder_name}_output.log")
log_file_handle = open(log_file, "w")
# Run the command in the background using subprocess.Popen
process = subprocess.Popen(command, stdout=log_file_handle, stderr=subprocess.STDOUT)
# Add the process and log file handle to the list
processes.append((process, log_file_handle))
# Monitor the real-time output of subprocesses
while processes:
finished_processes = []
for process, log_file_handle in processes:
return_code = process.poll()
if return_code is None:
# Process is still running
# Close the log file handle
log_file_handle.close()
# Reopen the log file in read mode
log_file_handle = open(log_file, "r")
# Read and process the latest output
output_lines = log_file_handle.readlines()
# Check if stdout is redirected correctly
if output_lines:
# Process and display the output
for line in output_lines:
print(line.strip())
# Optionally, you can update a progress indicator here
else:
# Skip printing output if stdout is correctly redirected
continue
else:
# Process has finished
finished_processes.append((process, log_file_handle))
# Remove finished processes from the list
for process, log_file_handle in finished_processes:
processes.remove((process, log_file_handle))
log_file_handle.close()
# Wait before checking the subprocesses again
time.sleep(1)
# Get the list of log files in folder.sam directory
log_files = glob.glob(os.path.join(output_sam_dir, "*_output.log"))
Skipping RS-03984980 - file already exists Skipping RS-03985010 - file already exists Skipping RS-03985029 - file already exists Skipping RS-03985030 - file already exists Skipping RS-03985031 - file already exists Skipping RS-03985032 - file already exists Skipping RS-03985033 - file already exists Skipping RS-03985034 - file already exists Skipping RS-03985035 - file already exists Skipping RS-03985036 - file already exists Skipping RS-03985037 - file already exists Skipping RS-03985038 - file already exists Skipping RS-03985039 - file already exists Skipping RS-03985040 - file already exists Skipping RS-03985041 - file already exists Skipping RS-04068630 - file already exists Skipping RS-04068631 - file already exists Skipping RS-04068632 - file already exists Skipping RS-04068633 - file already exists Skipping RS-04068634 - file already exists Skipping RS-04068635 - file already exists Skipping RS-04068636 - file already exists Skipping RS-03984981 - file already exists Skipping RS-03984982 - file already exists Skipping RS-03984983 - file already exists Skipping RS-03984984 - file already exists Skipping RS-03984985 - file already exists Skipping RS-03984986 - file already exists Skipping RS-03984987 - file already exists Skipping RS-03984988 - file already exists Skipping RS-03984989 - file already exists Skipping RS-03984990 - file already exists Skipping RS-03984991 - file already exists Skipping RS-03984992 - file already exists Skipping RS-03984993 - file already exists Skipping RS-03984995 - file already exists Skipping RS-03984996 - file already exists Skipping RS-03984997 - file already exists Skipping RS-03984998 - file already exists Skipping RS-03984999 - file already exists Skipping RS-03985000 - file already exists Skipping RS-03985001 - file already exists Skipping RS-03985002 - file already exists Skipping RS-03985003 - file already exists Skipping RS-03985004 - file already exists Skipping RS-03985005 - file already exists Skipping RS-03985006 - file already exists Skipping RS-03985007 - file already exists Skipping RS-03985008 - file already exists Skipping RS-03985009 - file already exists Skipping RS-03984975 - file already exists Skipping RS-03984976 - file already exists Skipping RS-03984977 - file already exists Skipping RS-03984978 - file already exists Skipping RS-03984979 - file already exists Skipping RS-03985011 - file already exists Skipping RS-03985012 - file already exists Skipping RS-03985013 - file already exists Skipping RS-03985014 - file already exists Skipping RS-03985015 - file already exists Skipping RS-03985016 - file already exists Skipping RS-03985017 - file already exists Skipping RS-03985018 - file already exists Skipping RS-03985019 - file already exists Skipping RS-03985020 - file already exists Skipping RS-03985021 - file already exists Skipping RS-03985022 - file already exists Skipping RS-03985023 - file already exists Skipping RS-03985024 - file already exists Skipping RS-03985025 - file already exists Skipping RS-03985026 - file already exists Skipping RS-03985027 - file already exists Skipping RS-03985028 - file already exists
Post-QC MultiQC analysis ¶
InĀ [22]:
# Specify the folder and file name for the MultiQC report
multiqc_output_dir = os.path.join(base_dir, f"{directory}_Post-QC_MultiQC")
os.makedirs(multiqc_output_dir, exist_ok=True)
multiqc_report_file = os.path.join(multiqc_output_dir, f"{directory}_Post-QC_multiqc_report.html")
# Generate the MultiQC report with --interactive flag
if os.path.exists(multiqc_report_file):
print("Skipping MultiQC - Report file already exists.")
else:
# Generate the MultiQC report with --interactive flag
multiqc_command = [
"multiqc",
"--interactive",
base_dir,
"-o", multiqc_output_dir,
"--filename", f"{directory}_Post-QC_multiqc_report.html"
]
subprocess.run(multiqc_command, check=True)
print("MultiQC report generation completed.")
# # Display the MultiQC report in the notebook
# display(HTML(filename=os.path.join(PATH, "multiqc_report.html")))
Skipping MultiQC - Report file already exists. MultiQC report generation completed.
Merging alignment logs ¶
InĀ [23]:
# Create an empty list to store the data
data = []
# Process each log file
for log_file_path in log_files:
log_file_name = os.path.basename(log_file_path)
filename = os.path.splitext(log_file_name)[0]
filename = filename.split("_")[0]
totalread = alignrate = concordant = multiple = discordant = None # Initialize the variables
with open(log_file_path) as f:
for line in f.readlines():
line = line.strip()
if 'reads; of these:' in line:
totalread = line.split(' ')[0]
elif 'overall alignment rate' in line:
alignrate = line.split(' ')[0]
elif 'aligned concordantly exactly 1 time' in line:
concordant = line.split(' ')[0]
elif 'aligned concordantly >1 times' in line:
multiple = line.split(' ')[0]
elif 'aligned discordantly 1 time' in line:
discordant = line.split(' ')[0]
data.append([filename, totalread, alignrate, concordant, multiple, discordant])
# Create a DataFrame from the collected data
df = pd.DataFrame(data, columns=['Filename', 'TotalRead', 'AlignRate', 'Concordant', 'Multiple', 'Discordant'])
# Write the DataFrame to a CSV file
output_path = os.path.join(output_sam_dir, f'{directory}_summary.csv')
df.to_csv(output_path, index=False)
Alignment summary ¶
InĀ [24]:
# Read the table from XLSX
data_summary = pd.read_csv(f"{output_path}").sort_values(by='Filename')
# Replace Filename values with sample_key.values() if matched with sample_key.keys()
for key, values in sample_key.items():
data_summary.loc[data_summary["Filename"] == key, "Filename"] = values[0]
# Remove the percentage sign from AlignRate and convert to numeric format
data_summary['AlignRate'] = data_summary['AlignRate'].str.rstrip('%').astype(float)
# Create subplots with adjustable figure size
fig, axes = plt.subplots(3, 1, figsize=(len(data_summary) * 3, len(data_summary) * 2))
# Use dedicated color for samples (if not in drug_color_map, then assign random colors)
sample_color_list = {filename: drug_color_map[filename.split("-")[1]] for filename in data_summary['Filename']}
# Bar plot of TotalRead
sns.barplot(ax=axes[0], x='Filename', y='TotalRead', data=data_summary, palette=sample_color_list)
axes[0].set_xlabel('Samples')
axes[0].set_ylabel('TotalRead')
axes[0].set_title('Samples TotalRead', fontsize=(len(data_summary) * 2)) # Adjust the title font size
axes[0].tick_params(axis='both', labelsize=(len(data_summary) * 1)) # Adjust the X and Y axis tick font size
# Add data labels to the middle of the bars
for p in axes[0].patches:
axes[0].annotate(f'{p.get_height():,.0f}', (p.get_x() + p.get_width() / 2, p.get_height()),
ha='center', va='center', xytext=(0, 5), textcoords='offset points', fontsize=(len(data_summary) * 1), rotation=90)
# Grouped bar plot of Concordant, Multiple, and Discordant
axes[1].set_ylim(0, max(data_summary['TotalRead']))
axes[1].bar(data_summary['Filename'], data_summary['Concordant'], label='Concordant', color='#C0C0C0')
axes[1].bar(data_summary['Filename'], data_summary['Multiple'], label='Multiple',
bottom=data_summary['Concordant'], color='#808080')
axes[1].bar(data_summary['Filename'], data_summary['Discordant'], label='Discordant',
bottom=data_summary['Concordant'] + data_summary['Multiple'], color='#DCDCDC')
axes[1].set_xlabel('Samples')
axes[1].set_ylabel('Count')
axes[1].set_title('Concordant, Multiple, and Discordant Counts', fontsize=(len(data_summary) * 2)) # Adjust the title font size
axes[1].tick_params(axis='both', labelsize=(len(data_summary) * 1)) # Adjust the X and Y axis tick font size
axes[1].legend()
# Add data labels to the bars
for p in axes[1].patches:
width = p.get_width()
height = p.get_height()
x = p.get_x()
y = p.get_y()
if p.get_label() == 'Discordant':
label = f'{height / 1000:,.0f}' # Format the value in thousands
axes[1].annotate(f'{p.get_height():,.0f}', (p.get_x() + p.get_width() / 2, p.get_height()),
ha='center', va='center', xytext=(0, 20), textcoords='offset points', fontsize=(len(data_summary) * 1))
else:
label = f'{height / 1000:,.0f}' # Format the value in thousands
axes[1].annotate(label, (x + width / 2, y + height / 2),
ha='center', va='center', fontsize=(len(data_summary) * 1))
# Bar plot of AlignRate
sns.barplot(ax=axes[2], x='Filename', y='AlignRate', data=data_summary, palette=sample_color_list)
axes[2].set_xlabel('Samples')
axes[2].set_ylabel('AlignRate')
axes[2].set_title('Samples AlignRate', fontsize=(len(data_summary) * 2)) # Adjust the title font size
axes[2].tick_params(axis='both', labelsize=(len(data_summary) * 1)) # Adjust the X and Y axis tick font size
axes[2].set_ylim(0, 100) # Set y-axis range from 0 to 100%
# Add data labels to the middle of the bars for AlignRate in percentage format
for p in axes[2].patches:
axes[2].annotate(f'{p.get_height()}%', (p.get_x() + p.get_width() / 2, p.get_height()),
ha='center', va='center', xytext=(0, 5), textcoords='offset points', fontsize=(len(data_summary) * 1), rotation=90)
# Rotate x-axis labels for all subplots
for ax in axes:
ax.tick_params(axis='x', labelrotation=90, size=(len(data_summary) * 3))
# Step 2: Adjust x-axis limits for axes 1
axes[1].set_xlim(-0.5, len(data_summary['Filename']) - 0.5)
# # Adjust spacing between subplots
# plt.tight_layout()
# Define the Read summary graph file path for graph storing
read_summary_path = os.path.join(graphs_files, f"{directory}_Read_Summary.svg")
plt.savefig(read_summary_path, format='svg', bbox_inches='tight', dpi=300) # Use bbox_inches='tight' to include all elements
print(f"{directory}_Read_Summary.svg saved to {read_summary_path}")
# plt.tight_layout()
plt.close()
# plt.show()
RQ023682_Read_Summary.svg saved to /home/harryjo/rnaseq_analysis/RQ023682/RQ023682_graphs/RQ023682_Read_Summary.svg
Post-Alignment ¶
SAM files to BAM files (samtools) ¶
InĀ [25]:
# Get the list of sam files in folder.sam directory
sam_files = glob.glob(os.path.join(output_sam_dir, "*.sam"))
# Create the output .csv directory
output_csv = os.path.join(base_dir, directory + "_rc.csv")
os.makedirs(output_csv, exist_ok=True)
# Get the list of bam files in folder.bam directory
bam_files = glob.glob(os.path.join(output_bam_dir, "*.bam"))
# Lock for synchronization
convert_lock = threading.Lock()
def convert_sam_to_bam(sam_file):
with convert_lock:
bam_file = os.path.join(output_bam_dir, os.path.basename(sam_file).replace(".sam", ".bam"))
if not os.path.exists(bam_file):
pysam.view("-bhS", "-o", bam_file, sam_file, catch_stdout=False)
print(f"Conversion completed for {sam_file} - BAM file: {bam_file}")
return bam_file
# Use ProcessPoolExecutor to run conversions in parallel
with ProcessPoolExecutor(max_workers=None) as executor:
bam_files = list(executor.map(convert_sam_to_bam, sam_files))
print("All conversions complete.")
All conversions complete.
Processing BAM files (samtools) ¶
InĀ [26]:
def run_subprocess(args, step_name, log_file, bam_file):
process = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
try:
stdout, stderr = process.communicate()
return_code = process.returncode
log_file.write(f"Step {step_name} - STDOUT:\n")
log_file.write(stdout)
log_file.write("\n")
log_file.write(f"Step {step_name} - STDERR:\n")
log_file.write(stderr)
log_file.write("\n")
# Check for expected merge message in stderr
expected_merge_message = "[bam_sort_core] merging from"
if expected_merge_message in stderr:
# print(f"Expected merge message found in {step_name} for {bam_file}")
return_code = 0 # Consider it as a success
if return_code != 0:
log_file.write(f"Step {step_name} - Error Return Code: {return_code}\n")
log_file.write(f"Error in {step_name} - {stderr}\n")
print(f"Error in {step_name} for {bam_file}")
return return_code
except:
process.terminate()
raise
MAX_RETRY_COUNT = 1
def process_bam(bam_file, log_dir):
bam_name = os.path.basename(bam_file)
bam_name_no_extension = os.path.splitext(bam_name)[0]
log_file_path = os.path.join(log_dir, f"{bam_name_no_extension}_log.txt")
# Define the output_processed_bam_dir
output_processed_bam_dir = os.path.join(base_dir, directory + "_processed_bam")
def run_subprocess_with_retries(args, step_name, log_file):
for retry in range(MAX_RETRY_COUNT):
return_code = run_subprocess(args, step_name, log_file)
if return_code == 0:
break
else:
print(f"Error in {step_name} for {bam_file}, Retry {retry+1}/{MAX_RETRY_COUNT}")
if return_code != 0:
print(f"Max retry reached for {step_name} - {bam_file}")
return return_code
def log_step_output(step_name, return_code):
with open(log_file_path, "a") as log_file:
log_file.write(f"Step {step_name} - Return Code: {return_code}\n")
# Step 1: Remove unmapped reads
unmapped_removed_bam_file = os.path.join(output_processed_bam_dir, f"{bam_name_no_extension}_unmapped_removed.bam")
with open(log_file_path, "a") as log_file:
step_name = "1 - Remove unmapped reads"
return_code = run_subprocess(
["samtools", "view", "-@9", "-h", "-F", "4", "-b", "-o", unmapped_removed_bam_file, bam_file],
step_name, log_file, bam_file
)
log_step_output(step_name, return_code)
if return_code != 0:
print(f"Error in {step_name} for {bam_file}")
return return_code
# Step 2: Sort by name
sorted_name_bam_file = os.path.join(output_processed_bam_dir, f"{bam_name_no_extension}_sorted_name.bam")
with open(log_file_path, "a") as log_file:
step_name = "2 - Sort by name"
return_code = run_subprocess(
["samtools", "sort", "-@9", "-m", "2G", "-n", "-o", sorted_name_bam_file, unmapped_removed_bam_file],
step_name, log_file, bam_file
)
if return_code != 0:
print(f"Error in {step_name} for {bam_file}")
return
# Step 3: Add/correct mate pair information
fixmate_bam_file = os.path.join(output_processed_bam_dir, f"{bam_name_no_extension}_fixmate.bam")
with open(log_file_path, "a") as log_file:
step_name = "3 - Add/correct mate pair information"
return_code = run_subprocess(
["samtools", "fixmate", "-@9", "-m", sorted_name_bam_file, fixmate_bam_file],
step_name, log_file, bam_file
)
log_step_output(step_name, return_code)
if return_code != 0:
print(f"Error in {step_name} for {bam_file}")
return
# Step 4: Sort by genome coordinate
sorted_coord_bam_file = os.path.join(output_processed_bam_dir, f"{bam_name_no_extension}_sorted_coord.bam")
with open(log_file_path, "a") as log_file:
step_name = "4 - Sort by genome coordinate"
return_code = run_subprocess(
["samtools", "sort", "-@9", "-m", "2G", "-o", sorted_coord_bam_file, fixmate_bam_file],
step_name, log_file, bam_file
)
log_step_output(step_name, return_code)
if return_code != 0:
print(f"Error in {step_name} for {bam_file}")
return
# Step 5: Mark duplicates
marked_duplicates_bam_file = os.path.join(output_processed_bam_dir, f"{bam_name_no_extension}_processed.bam")
with open(log_file_path, "a") as log_file:
step_name = "5 - Mark duplicates"
return_code = run_subprocess(
["samtools", "markdup", "-@9", sorted_coord_bam_file, marked_duplicates_bam_file],
step_name, log_file, bam_file
)
log_step_output(step_name, return_code)
if return_code != 0:
print(f"Error in {step_name} for {bam_file}")
return
# Step 6: Index the aligned BAM file
index_file = os.path.join(output_processed_bam_dir, f"{bam_name_no_extension}.bai")
with open(log_file_path, "a") as log_file:
step_name = "6 - Index the aligned BAM file"
return_code = run_subprocess(
["samtools", "index", marked_duplicates_bam_file],
step_name, log_file, bam_file
)
log_step_output(step_name, return_code)
if return_code != 0:
print(f"Error in {step_name} for {bam_file}")
return
# Clean up intermediate files from previous steps
os.remove(unmapped_removed_bam_file)
os.remove(sorted_name_bam_file)
os.remove(fixmate_bam_file)
os.remove(sorted_coord_bam_file)
# print(f"Processing complete for {bam_file}")
def main():
bam_files = glob.glob(os.path.join(output_bam_dir, "*.bam"))
output_processed_bam_dir = os.path.join(base_dir, directory + "_processed_bam")
log_dir = os.path.join(output_processed_bam_dir, "log") # Define and initialize log_dir
os.makedirs(log_dir, exist_ok=True)
errored_files = []
with concurrent.futures.ProcessPoolExecutor() as executor:
for return_code, bam_file in zip(executor.map(process_bam, bam_files, [log_dir] * len(bam_files)), bam_files):
if return_code != 0:
errored_files.append(bam_file)
# # Retry processing for errored files
# for bam_file in errored_files:
# print(f"Retrying processing for {bam_file}")
# process_bam(bam_file, log_dir)
print("All processes complete.")
if __name__ == "__main__":
try:
# Check if processed BAM files already exist and skip processing
bam_files = glob.glob(os.path.join(output_bam_dir, "*.bam"))
output_processed_bam_dir = os.path.join(base_dir, directory + "_processed_bam")
processed_bam_files = [os.path.join(output_processed_bam_dir, f"{os.path.splitext(os.path.basename(bam))[0]}_processed.bam")
for bam in bam_files]
bam_files_to_process = [bam_file for bam_file, processed_bam_file in zip(bam_files, processed_bam_files)
if not os.path.exists(processed_bam_file)]
if bam_files_to_process:
print(f"Processing {len(bam_files_to_process)} BAM files.")
main() # Call the main processing function
else:
print("All BAM files are already processed. Skipping.")
except KeyboardInterrupt:
print("Execution interrupted.")
All BAM files are already processed. Skipping.
Collect read count from processed bam files ¶
InĀ [27]:
# Define processing_bam_dir variables
processing_bam_dir = os.path.join(base_dir, directory + "_processed_bam")
processing_bam = glob.glob(os.path.join(processing_bam_dir, "*_processed.bam"))
# Lock for synchronization
collect_lock = threading.Lock()
def collect_read_counts_and_save(bam_file):
mapq_threshold = 30
read_count = {}
with pysam.AlignmentFile(bam_file, "rb") as sorted_bam:
for read in sorted_bam:
if (
# read.flag == 0
# read.is_proper_pair # Check if the read is part of a proper pair
# and not read.mate_is_unmapped # Check if the mate is mapped
read.mapping_quality >= mapq_threshold
):
reference_name = sorted_bam.get_reference_name(read.reference_id)
if reference_name != "*" and not read.is_unmapped:
read_count[reference_name] = read_count.get(reference_name, 0) + 1
# Construct paths for output BAM and _rc.csv files
bam_name = os.path.basename(bam_file)
bam_name_no_extension = os.path.splitext(bam_name)[0]
bam_name_no_extension = bam_name_no_extension.replace("_processed", "")
output_csv_file = os.path.join(output_csv, f"{bam_name_no_extension}_rc.csv")
# Save the read count data to CSV file
with collect_lock:
with open(output_csv_file, 'w', newline='') as csvfile:
writer = csv.writer(csvfile)
writer.writerow(['ID', bam_name_no_extension])
for key, value in read_count.items():
writer.writerow([key, value])
# print(f"Processed {bam_file} and saved the result to {output_csv_file}")
# Use ProcessPoolExecutor to run read count collection and saving in parallel
with ProcessPoolExecutor(max_workers=None) as executor:
executor.map(collect_read_counts_and_save, processing_bam)
print("All read count collection and saving processes complete.")
All read count collection and saving processes complete.
Merging samples read counts ¶
InĀ [28]:
# In case, wanna run only this part
output_csv = os.path.join(base_dir, directory+ "_rc.csv")
# Retrieving fasta_index file
merged_data = pd.read_csv(fasta_file)
# Get the list of CSV files in the folder
csv_files = glob.glob(os.path.join(output_csv, "*.csv"))
# Sort the CSV files list in ascending order
csv_files.sort()
# Process each CSV file
for csv_file in csv_files:
sam_file_name = os.path.splitext(os.path.basename(csv_file))[0]
sam_file_name = sam_file_name.split('_')[0]
df = pd.read_csv(csv_file)
df = df.rename(columns={"Read count": sam_file_name})
merged_data = pd.merge(merged_data, df, on=['ID'], how='outer')
# Sort the merged data by ID in reverse order
merged_data = merged_data.sort_values(by='ORF_ID')
# Remove the 'ID' column
merged_data = merged_data.drop('ID', axis=1)
# Rename the headers based on the sample_key dictionary
for key, value in sample_key.items():
if key in merged_data.columns:
new_header = '_'.join(value) if isinstance(value, list) else value
merged_data = merged_data.rename(columns={key: new_header})
# Define the output file path for the merged XLSX
merged_xlsx_path = os.path.join(database_files, f"{directory}_merged.xlsx")
# Save the merged data to an XLSX file
merged_data.to_excel(merged_xlsx_path, index=False)
print(f"Merged XLSX file saved to {merged_xlsx_path}")
Merged XLSX file saved to /home/harryjo/rnaseq_analysis/RQ023682/RQ023682_db/RQ023682_merged.xlsx
SAM & BAM Files compression (gzip, tar) ¶
InĀ [29]:
def compress_and_remove(file):
with open(file, 'rb') as f_in:
with gzip.open(file + '.gz', 'wb') as f_out:
shutil.copyfileobj(f_in, f_out)
os.remove(file)
def compress_and_create_tar(files, tar_filename, folder_name):
with tarfile.open(tar_filename, 'w:gz') as tar:
for file in files:
compressed_filename = file + '.gz'
with gzip.open(compressed_filename, 'wb') as f_out:
with open(file, 'rb') as f_in:
shutil.copyfileobj(f_in, f_out)
tar.add(compressed_filename, arcname=os.path.join(folder_name, os.path.basename(compressed_filename)))
os.remove(compressed_filename)
if __name__ == '__main__':
compression_handle = " " # Set this to "Yes" to execute the compression and archiving
if compression_handle == "Yes":
sam_files = glob.glob(os.path.join(output_sam_dir, "*.sam"))
bam_files = glob.glob(os.path.join(output_bam_dir, "*.bam"))
with ProcessPoolExecutor() as executor:
completed_tasks = 0
total_tasks = len(sam_files) + len(bam_files)
for result in executor.map(compress_and_remove, sam_files + bam_files):
completed_tasks += 1
print(f'Compressing: {completed_tasks}/{total_tasks}', end='\r')
compressed_sam_files = glob.glob(os.path.join(output_sam_dir, "*.sam.gz"))
compressed_bam_files = glob.glob(os.path.join(output_bam_dir, "*.bam.gz"))
if not os.path.exists(os.path.join(base_dir, output_sam_dir, f"{directory}_sam_compressed.tar.gz")):
compress_and_create_tar(
compressed_sam_files,
os.path.join(base_dir, output_sam_dir, f"{directory}_sam_compressed.tar.gz"),
output_sam_dir
)
if not os.path.exists(os.path.join(base_dir, output_bam_dir, f"{directory}_bam_compressed.tar.gz")):
compress_and_create_tar(
compressed_bam_files,
os.path.join(base_dir, output_bam_dir, f"{directory}_bam_compressed.tar.gz"),
output_bam_dir
)
for file in compressed_sam_files:
os.remove(file)
for file in compressed_bam_files:
os.remove(file)
else:
print("Compression handle is not set")
Compression handle is not set
SAM & BAM Files decompression (optional) ¶
InĀ [30]:
# Set the decompression handle here
decompression_handle = " " # Set this to "yes" to enable decompression
def decompress_and_remove_tar(tar_filename, output_dir):
with tarfile.open(tar_filename, 'r:gz') as tar:
tar.extractall(output_dir)
os.remove(tar_filename)
def decompress_gz_file(gz_file):
with gzip.open(gz_file, 'rb') as f_in:
original_filename = gz_file[:-3] # Remove .gz extension
with open(original_filename, 'wb') as f_out:
shutil.copyfileobj(f_in, f_out)
os.remove(gz_file)
if __name__ == '__main__':
# Specify the paths to the tar.gz files and the output directories
sam_tar_file = os.path.join(base_dir, output_sam_dir, f"{directory}_sam_compressed.tar.gz")
bam_tar_file = os.path.join(base_dir, output_bam_dir, f"{directory}_bam_compressed.tar.gz")
sam_output_dir = output_sam_dir # Output directory for SAM files
bam_output_dir = output_bam_dir # Output directory for BAM files
if decompression_handle == "yes":
decompress_and_remove_tar(sam_tar_file, sam_output_dir)
decompress_and_remove_tar(bam_tar_file, bam_output_dir)
sam_gz_files = glob.glob(os.path.join(sam_output_dir, "*.sam.gz"))
bam_gz_files = glob.glob(os.path.join(bam_output_dir, "*.bam.gz"))
with ProcessPoolExecutor() as executor:
executor.map(decompress_gz_file, sam_gz_files)
executor.map(decompress_gz_file, bam_gz_files)
else:
print("Decompression handle is not set")
Decompression handle is not set
Normalization ¶
Installing R packages (optional) ¶
InĀ [31]:
# Set install_R_handle to "yes" to enable package installation
install_R_handle = ""
if install_R_handle == "yes":
# Install required packages
# robjects.r('install.packages("BiocManager", repos="http://cran.r-project.org")')
# Set the C++ compiler version
os.environ['CXX'] = 'g++-9'
# Install packages for normalization
bioc_manager = robjects.packages.importr("BiocManager")
bioc_manager.install("limma")
bioc_manager.install("edgeR")
robjects.r('install.packages("DescTools")')
else:
print("Install R Handle is not set")
Install R Handle is not set
InĀ [32]:
from rpy2.robjects.packages import importr
from rpy2.robjects import r
# Load packages
base = importr('base')
limma = importr("limma")
edgeR = importr("edgeR")
# Check versions
limma_version = r('packageVersion("limma")')
edgeR_version = r('packageVersion("edgeR")')
print("limma version:", limma_version[0])
print("edgeR version:", edgeR_version[0])
limma version: [1] 3 58 1 edgeR version: [1] 4 0 16
Import R packages ¶
InĀ [33]:
# Activate R's base package
base = importr('base')
# Load packages
limma = importr("limma")
edgeR = importr("edgeR")
stats = importr("stats")
Desctools = importr("DescTools")
graphics = importr("graphics")
#Cheking current library Path
robjects.r(".libPaths()")
# Set R's working directory
robjects.r(f'setwd("{base_dir}")')
Out[33]:
array(['/media/kimlab/DATA1/harryjo/Reference/9.1_delta'], dtype='<U47')
Import functions ¶
InĀ [34]:
# Changing pandas dataframe to R dataframe
def df_to_r_dataframe(df):
with (robjects.default_converter + pandas2ri.converter).context():
r_df = robjects.conversion.get_conversion().py2rpy(df)
return r_df
# Limma quantile normaization
def limma_normalizeQuantiles(r_dataframe, ties=False):
normalized_data = limma.normalizeQuantiles(r_dataframe, ties=False)
return normalized_data
# Winsorization
lower_quantile = 0.00001
upper_quantile = 0.99999
def winsorize_func(r_dataframe, lower_quantile, upper_quantile):
colnames = list(r_dataframe.colnames)
robjects.r('winsorized_cols <- list()')
for col in colnames:
col_str = str(col)
robjects.r.assign("col_data", r_dataframe.rx2(col_str))
robjects.r(f'''
quantiles <- quantile(col_data, probs = c({lower_quantile}, {upper_quantile}), na.rm = TRUE)
winsorized_cols[["{col_str}"]] <- DescTools::Winsorize(
as.numeric(col_data),
val = quantiles
)
''')
result = robjects.r('as.data.frame(winsorized_cols, check.names = FALSE)')
# Convert R dataframe to pandas dataframe
result_pd_df = robjects.conversion.rpy2py(result)
# Reset index to start from 0
result_pd_df.reset_index(drop=True, inplace=True)
return result_pd_df
# Saving processed dataframe (without indices) with indices from fasta_data
# Path to the output CSV file
fasta_data = pd.read_csv(fasta_file)
# Cleaning out dAAVS1 and pDest from the dataframe
exclude_keywords = ["dAAVS1", "pDest"]
fasta_data = fasta_data[~fasta_data.apply(lambda row: any(keyword in str(row) for keyword in exclude_keywords), axis=1)]
# Changing ORF_ID columns to number and sort the dataframe, reset index
fasta_data.loc[:, "ORF_ID"] = pd.to_numeric(fasta_data["ORF_ID"], errors="coerce")
fasta_data = fasta_data.sort_values("ORF_ID").reset_index(drop=True)
# Remove unwanted columns
drop_columns = ["ID", "Length"]
fasta_data = fasta_data.drop(drop_columns, axis=1)
# Reset index to include inside dataframe
fasta_data = fasta_data.reset_index()
# Change dataframe type to float
cols_to_convert = fasta_data.columns[~fasta_data.columns.isin(['Group', 'Gene_Symbol'])]
fasta_data[cols_to_convert] = fasta_data[cols_to_convert].astype(float)
def save_dataframe(index_df: pd.DataFrame, analyzed_df: pd.DataFrame, output_path: str) -> pd.DataFrame:
"""
Save the merged DataFrame to an Excel file.
Parameters:
index_df (pd.DataFrame): DataFrame to be used as the index.
analyzed_df (pd.DataFrame): DataFrame to be analyzed and merged.
output_path (str): Output file path for saving the merged DataFrame.
Returns:
pd.DataFrame: The merged DataFrame.
"""
# Reset index for the analyzed_df
analyzed_df_indexed = analyzed_df.reset_index().astype(float)
# Perform the merge based on column indices
merged_df = index_df.merge(analyzed_df_indexed, on='index')
merged_df.replace(['nan'], np.nan, inplace=True)
merged_df.set_index('index', inplace=True)
merged_df.index.name = None
# Saving the merged DataFrame to xlsx
merged_df.to_excel(output_path, index=False)
print(f"DataFrame saved to {output_path}")
return merged_df
# For TMM / GeTMM normalization
def edgeR_normfactor(data, handle):
experimental_columns = data.columns
if handle == "Control":
# Define your control and experimental keywords
neg_control_keywords = ["DMSO", "Baseline"]
none_keywords = ["mCherryPositive&BFPNegative", "mCherryNegative&BFPNegative", "Serumfree"]
# Determine group names based on column names
group_names = [
"Control" if any(keyword in col for keyword in neg_control_keywords)
else ("None" if any(keyword in col for keyword in none_keywords)
else "Experimental")
for col in experimental_columns
]
elif handle == "Triplet":
# Extract the second part of each column name after splitting by underscore
extracted_keywords = [col.split('-')[1] for col in experimental_columns]
group_names = extracted_keywords
group_factor = robjects.FactorVector(group_names)
data_raw_r = df_to_r_dataframe(data)
dge = edgeR.DGEList(counts=data_raw_r, group=group_factor)
dge = edgeR.calcNormFactors(dge, method="TMM")
dge_normfactors_r = dge.rx2('samples')
# dge_normfactor to Pandas dataframe
with (robjects.default_converter + pandas2ri.converter).context():
dge_normfactors_df = robjects.conversion.get_conversion().rpy2py(dge_normfactors_r)
norm_raw = edgeR.cpm(dge)
norm_log = edgeR.cpm(dge, log=True)
norm_colnames = list(data_raw_r.colnames)
return dge_normfactors_df, norm_raw, norm_log, norm_colnames, data_raw_r
Gene count summary ¶
InĀ [35]:
# Read the table from XLSX
data_rc = pd.read_excel(f"{merged_xlsx_path}")
# Cleaning out dAAVS1 and pDest from the dataframe
exclude_keywords = ["dAAVS1", "pDest"]
gene_data = data_rc[~data_rc.apply(lambda row: any(keyword in str(row) for keyword in exclude_keywords), axis=1)]
# Select data based on header values
columns_to_include = [column for column in data_rc.columns if any(any(sample in column for sample in samples) for samples in sample_key.values())]
gene_data = gene_data[columns_to_include]
# Create the sample_color_list using columns_to_include
sample_color_list = {column: drug_color_map[column.split("-")[1]] for column in columns_to_include}
# Count the number of genes per column
gene_counts = gene_data.count()
# Calculate gene count statistics
max_gene_count = gene_counts.max()
min_gene_count = gene_counts.min()
avg_gene_count = gene_counts.mean().astype(int)
half_of_max_count = int(max_gene_count / 2)
# Create a bar graph with customized colors and outline color
fig, ax = plt.subplots(figsize=(18, 9)) # Adjust the figure size as desired
gene_counts.plot(kind='bar',
color=[sample_color_list.get(column) for column in gene_counts.index],
edgecolor=[sample_color_list.get(column) for column in gene_counts.index],
ax=ax)
ax.set_title('Number of Genes per Samples', fontsize=16)
ax.set_xlabel('Samples', fontsize=12)
ax.set_ylabel('Number of Genes', fontsize=12)
ax.tick_params(axis='y', labelsize=12) # Set font size for y-axis labels
ax.tick_params(axis='x', labelsize=12) # Set font size for x-axis labels
# Rotate x-axis labels if needed
plt.xticks(rotation=90)
# Marking any y-axis that has less than half of the maximum count
plt.axhline(y= half_of_max_count, color='black', linewidth=3)
# Set the background color and gridlines
ax.set_facecolor('white')
ax.grid(False)
# Define the Gene Count graph file path for graph storing
Gene_count_path = os.path.join(graphs_files, f"{directory}_Gene_Count.svg")
plt.savefig(Gene_count_path, format='svg', bbox_inches='tight', dpi=300)
print(f"{directory}_Gene_Count.svg saved to {Gene_count_path}")
# Print statistics
print("Maximum Gene Count:", max_gene_count)
print("Half of Maximum Gene Count:", half_of_max_count)
print("Minimum Gene Count:", min_gene_count)
print("Average Gene Count:", avg_gene_count)
plt.close()
# plt.show()
RQ023682_Gene_Count.svg saved to /home/harryjo/rnaseq_analysis/RQ023682/RQ023682_graphs/RQ023682_Gene_Count.svg Maximum Gene Count: 15064 Half of Maximum Gene Count: 7532 Minimum Gene Count: 1627 Average Gene Count: 13100
Read count normalization (EdgeR, Bioinfokit) ¶
InĀ [36]:
# Read the table from XLSX
data_rc = pd.read_excel(f"{merged_xlsx_path}")
# Cleaning out dAAVS1 and pDest from the dataframe
exclude_keywords = ["dAAVS1", "pDest"]
data_nor = data_rc[~data_rc.apply(lambda row: any(keyword in str(row) for keyword in exclude_keywords), axis=1)]
# Changing ORF_ID columns to number and sort the dataframe
data_nor.loc[:, "ORF_ID"] = pd.to_numeric(data_nor["ORF_ID"], errors="coerce")
data_nor = data_nor.sort_values("ORF_ID").reset_index(drop=True)
# Select data based on header values
columns_to_include = [column for column in data_nor.columns if any(any(sample in column for sample in samples) for samples in sample_key.values())]
data_col = data_nor[columns_to_include]
# Add length for length normalization
gene_info = data_nor[["Gene_Symbol", "Length"]]
gene_merge = gene_info.merge(data_col, left_index=True, right_index=True)
# Set dataframes for gene expression normalziation
data_edgeR = data_col
data_bioinfokit = gene_merge
# norm function associated with scipy.
from bioinfokit.analys import norm
# Define the normalization method you want to use: 'CPM', 'TMM', 'GeTMM' or 'TPM', 'RPKM'
chosen_normalization = 'TMM' # Change this to your desired normalization method\
if chosen_normalization == 'CPM':
# Load your data and replace NaN with 0
data_cpm = data_edgeR.fillna(0)
# Convert pandas DataFrame to R data frame
data_raw_r = df_to_r_dataframe(data_cpm)
# Calculate CPM in R
norm_raw = edgeR.cpm(data_raw_r)
norm_log = edgeR.cpm(data_raw_r, log=True)
# Access column names
norm_colnames = list(data_raw_r.colnames)
elif chosen_normalization in ['GeTMM', 'TMM']:
if chosen_normalization == 'GeTMM':
# Load your data and replace NaN with 0 and drop Gene_Symbol
data_norm = data_bioinfokit.fillna(0)
data_norm = data_norm.drop(['Gene_Symbol'], axis=1)
# Adding length for the length normalization
data_norm['Length'] = data_norm['Length'] / 10**3
data_norm.iloc[:, 1:] = data_norm.iloc[:, 1:].div(data_norm['Length'], axis=0)
# Select data based on header values
columns_to_include = [col for col in data_bioinfokit.columns if any(any(sample in col for sample in samples) for samples in sample_key.values())]
data_norm = data_norm[columns_to_include]
elif chosen_normalization == 'TMM':
data_norm = data_edgeR.fillna(0)
# Set the GeTMM/TMM handle
getmm_tmm_handle = "Control" # Change this to your desired normalization factors
# Perform normalization and get results
dge_normfactors_df, norm_raw, norm_log, norm_colnames, data_raw_r = edgeR_normfactor(data_norm, getmm_tmm_handle)
elif chosen_normalization in ['RPKM', 'TPM']:
if chosen_normalization == 'RPKM':
nm_method = 'rpkm'
elif chosen_normalization == 'TPM':
nm_method = 'tpm'
data_norm = data_bioinfokit.fillna(0)
# Convert 'Length' column to numeric (if it's not already)
data_norm['Length'] = pd.to_numeric(data_norm['Length'], errors='coerce')
# Make 'Gene_Symbol' column as the index column
data_norm.set_index('Gene_Symbol', inplace=True)
nm = norm()
getattr(nm, nm_method)(df=data_norm, gl='Length')
# Get the normalized DataFrame
nor_df = getattr(nm, f'{nm_method}_norm')
# Reset index back to default integer index
nor_df.reset_index(drop=True, inplace=True)
nor_raw = nor_df
# Calculate the logarithm of nor values (base 2, with a small constant added)
avoid_nan = 0.18050946883 # Mimic edgeR cpm(log=True)
nor_log = (np.log2(nor_raw + avoid_nan)).astype(float)
# Convert nor_raw and nor_log DataFrame to array
norm_raw = nor_raw.values
norm_log = nor_log.values
# Access column names
norm_colnames = nor_raw.columns.tolist()
else:
raise ValueError(f"Invalid normalization method: {chosen_normalization}. Choose 'CPM', 'TMM', 'GeTMM' or 'TPM', 'RPKM'.")
# cpm_raw to dataframe for storage
nor_raw_df = pd.DataFrame(data=np.where(norm_raw != 0, norm_raw, np.nan), columns=norm_colnames)
nor_log_df = pd.DataFrame(data=np.where(norm_log != 0, norm_log, np.nan), columns=norm_colnames)
# Saving the merged DataFrame
nor_raw_path = os.path.join(database_files_original, f"{directory}_nor_raw.xlsx")
nor_raw_compile_df = save_dataframe(fasta_data, nor_raw_df, nor_raw_path)
nor_log_path = os.path.join(database_files_original, f"{directory}_nor_log.xlsx")
nor_log_compile_df = save_dataframe(fasta_data, nor_log_df, nor_log_path)
DataFrame saved to /home/harryjo/rnaseq_analysis/RQ023682/RQ023682_db/Original_db/RQ023682_nor_raw.xlsx DataFrame saved to /home/harryjo/rnaseq_analysis/RQ023682/RQ023682_db/Original_db/RQ023682_nor_log.xlsx
Noise detection ¶
InĀ [37]:
# Histogram of log2 to determine proper cutoff
median_log2_nor = base.apply(norm_log, 1, stats.median)
graphics.hist(median_log2_nor)
expr_cutoff = -1.0
graphics.abline(v=expr_cutoff, col="red", lwd=3)
expr_count = base.sum(FloatVector(np.array(median_log2_nor) > expr_cutoff))[0]
# Plot histogram using Python
plt.figure(figsize=(18, 9)) # Adjust the figure size as desired
# Calculate the bin width and adjust the bar width and spacing
num_bins = 50
data_range = np.ptp(median_log2_nor)
bin_width = data_range / num_bins
bar_width = 3.0 * bin_width
bar_spacing = bin_width - bar_width
# Plot the histogram with adjusted bar width and spacing
plt.hist(median_log2_nor, bins=num_bins, range=(np.min(median_log2_nor), np.max(median_log2_nor)),
color='black', edgecolor='black', linewidth=0.5,
rwidth=bar_width, align='mid')
# Set the background color and gridlines
plt.gca().set_facecolor('lightgray')
plt.grid(color='white', linestyle='-', linewidth=0.5)
plt.axvline(x=expr_cutoff, color='red', linewidth=3)
plt.title('Histogram of log2 nor', fontsize=16)
plt.xlabel('Log2 nor', fontsize=16)
plt.ylabel('No of Genes in log2', fontsize=16)
plt.yscale('log')
plt.tick_params(axis='both', labelsize=12)
print("Total number of genes after the cutoff:", int(expr_count))
# Define the nor Histogram graph file path for graph storing
nor_histogram_path = os.path.join(graphs_files_original, f"{directory}_norm_histogram.svg")
plt.savefig(nor_histogram_path, format='svg', bbox_inches='tight', dpi=300)
print(f"{directory}_nor_Histogram.svg saved to {nor_histogram_path}")
# plt.close()
plt.close()
Total number of genes after the cutoff: 14403 RQ023682_nor_Histogram.svg saved to /home/harryjo/rnaseq_analysis/RQ023682/RQ023682_graphs/Original_Graph/RQ023682_norm_histogram.svg
Noise removal from dataframe ¶
InĀ [38]:
# Convert median_log2_nor to a numpy array
median_log2_nor_np = np.array(median_log2_nor)
# Create a boolean mask based on the expression cutoff
mask = median_log2_nor_np > expr_cutoff
unmask = median_log2_nor_np <= expr_cutoff
# Get the indices where the mask is True
indices = np.where(mask)[0]
# Get the indices where the mask is not True
non_indices = np.where(unmask)[0]
# Subset nor_raw based on the indices
nor_clean = norm_raw[indices, :]
# Subset nor_raw based on not in the indices
nor_unclean = norm_raw[non_indices, :]
# Access column names
if chosen_normalization in ['RPKM', 'TPM']:
nor_colnames = nor_raw.columns.tolist()
elif chosen_normalization in ['CPM', 'TMM', 'GeTMM']:
nor_colnames = list(data_raw_r.colnames)
# nor_raw to dataframe for storage
nor_clean_df = pd.DataFrame(data=np.where(nor_clean != 0, nor_clean, np.nan), index=indices, columns=nor_colnames)
nor_unclean_df = pd.DataFrame(data=np.where(nor_unclean != 0, nor_unclean, np.nan), index=non_indices, columns=nor_colnames)
# Saving the merged DataFrame
nor_clean_path = os.path.join(database_files_original, f"{directory}_nor_clean.xlsx")
nor_clean_compile_df = save_dataframe(fasta_data, nor_clean_df, nor_clean_path)
# Saving the merged Dataframe
nor_unclean_path = os.path.join(database_files_original, f"{directory}_nor_unclean.xlsx")
nor_unclean_compile_df = save_dataframe(fasta_data, nor_unclean_df, nor_unclean_path)
DataFrame saved to /home/harryjo/rnaseq_analysis/RQ023682/RQ023682_db/Original_db/RQ023682_nor_clean.xlsx DataFrame saved to /home/harryjo/rnaseq_analysis/RQ023682/RQ023682_db/Original_db/RQ023682_nor_unclean.xlsx
Correlation matrix clustering (Pre-normalization) ¶
InĀ [39]:
# Get the column names from data_raw
col_names = nor_clean_df.columns.values.tolist()
# Calculate the correlation matrix in R
cor_matrix = stats.cor(nor_clean, use="everything", method ="pearson")
# Convert the correlation matrix to a NumPy array
cor_matrix_np = np.asarray(cor_matrix)
# Compute the condensed distance matrix
dist_matrix = pdist(cor_matrix_np)
# Perform hierarchical clustering with the condensed distance matrix
linkage_matrix = hierarchy.linkage(dist_matrix, method='average')
dendrogram_row = hierarchy.dendrogram(linkage_matrix, no_plot=True)
# Get the order of rows and columns from the dendrogram
order_row = dendrogram_row['leaves']
order_col = dendrogram_row['leaves']
# Reorder the correlation matrix based on the clustering
cor_matrix_ordered = cor_matrix_np[order_row][:, order_col]
# Perform hierarchical clusterisng and plot the heatmap with clustering
sns.set(font_scale=0.7)
g = sns.clustermap(cor_matrix_np, cmap='coolwarm', cbar_pos=(1, 0.2, 0.03, 0.5), cbar_kws={'label': 'Correlation'},
dendrogram_ratio=(0.1, 0.1), linewidths=0.5,
xticklabels=col_names, yticklabels=col_names)
# Rotate the x-axis labels
plt.setp(g.ax_heatmap.get_xticklabels(), rotation=90)
# Set the title above the clustering
g.ax_heatmap.set_title('Heatmap of Correlation Matrix with Clustering', fontsize=16, pad=20, loc='center', y=1.15)
# Define the nor Histogram graph file path for graph storing
PCC_Heatmap_path = os.path.join(graphs_files_original, f"{directory}_PCC_Heatmap.svg")
plt.savefig(PCC_Heatmap_path, format='svg',bbox_inches='tight', dpi=300)
print(f"{directory}_PCC_Heatmap.svg saved to {PCC_Heatmap_path}")
# plt.close()
plt.close()
RQ023682_PCC_Heatmap.svg saved to /home/harryjo/rnaseq_analysis/RQ023682/RQ023682_graphs/Original_Graph/RQ023682_PCC_Heatmap.svg
Removing unnecessary columns ¶
InĀ [40]:
# List of strings you want to check for in the column names
unnecessary_to_remove = ["mCherryPositive&BFPNegative", "mCherryNegative&BFPNegative", "Serumfree"]
# List comprehension to filter out columns containing any of the strings
column_removal = [col for col in nor_clean_compile_df.columns if any(string in col for string in unnecessary_to_remove)]
# Get the common elements between 'column_removal' and DataFrame columns using 'intersection'
columns_to_drop = set(column_removal).intersection(nor_clean_compile_df.columns)
# Drop the columns from the DataFrame
nor_clean_compile_df = nor_clean_compile_df.drop(columns=columns_to_drop)
# Saving the merged DataFrame
nor_clean_removed_path = os.path.join(database_files_original, f"{directory}_nor_clean_removed.xlsx")
nor_clean_compile_df.to_excel(nor_clean_removed_path, index=False)
Box & Violin Plot (Pre-normalization) ¶
InĀ [41]:
# Removing nan values
nor_clean_compile_df.replace(['nan'], np.nan, inplace=True)
# Select the columns containing the sample data
columns_to_include = [column for column in nor_clean_compile_df.columns if any(any(sample in column for sample in samples) for samples in sample_key.values())]
sample_data = nor_clean_compile_df[columns_to_include]
sample_data = sample_data.dropna(how='all').astype(float)
# Get the gene symbols corresponding to the data points
gene_symbols = nor_clean_compile_df.loc[sample_data.index, 'Gene_Symbol']
# Combine the sample data and gene symbols into a single DataFrame
data_melted = pd.concat([sample_data, gene_symbols], axis=1)
# Extract the sample names from the column headers
sample_names = sample_data.columns.str.split('_', expand=True).get_level_values(0)
# Melt the data for plotting
data_melted = data_melted.melt(id_vars=['Gene_Symbol'], value_vars=sample_names, var_name='Samples', value_name='Expression Level')
# Calculate the 10th and 90th percentile of the expression level for each sample
lower_quantile = data_melted.groupby('Samples')['Expression Level'].quantile(0.1)
upper_quantile = data_melted.groupby('Samples')['Expression Level'].quantile(0.9)
# Calculate the number of unique samples
num_unique_samples = len(data_melted['Samples'].unique())
# Define the sample names
include_baselline = name_list.copy()
new_value = ["Baseline", "mCherryPositive&BFPNegative", "mCherryNegative&BFPNegative"]
merged_samples_list = include_baselline + new_value
col_sample = [column for column in nor_clean_compile_df.columns if any(sample in column for sample in merged_samples_list)]
# Create a dictionary mapping samples to colors
sample_color_dict = {}
overlap_samples = [sample for sample in col_sample if sample.split("-")[1] in drug_color_map]
# Assign colors from drug_color_map to overlapping samples
for sample in overlap_samples:
sample_color_dict[sample] = drug_color_map[sample.split("-")[1]]
# Generate a color palette with the desired number of colors
num_colors = len(col_sample) - len(overlap_samples)
color_palette = sns.color_palette("Set1", n_colors=num_colors) # Or any other color palette you prefer
# Assign colors to the remaining samples in col_sample
col_colors = dict(zip(col_sample, color_palette))
sample_color_dict.update(col_colors)
# Calculate the figsize dynamically based on the number of unique samples
fig_width = min(12, 1.5 + num_unique_samples * 1)
fig_height = min(12, 1.5 + num_unique_samples * 0.5)
# Create a figure with two subplots
fig, axs = plt.subplots(1, 2, figsize=(fig_width, fig_height))
# Plot the box plot in the first subplot
sns.boxplot(x='Expression Level', y='Samples', data=data_melted, ax=axs[0], palette=sample_color_dict)
axs[0].set_xscale('log')
axs[0].set_title('Whisker Box Plot normal data')
axs[0].set_xlabel('Log Expression Level')
axs[0].set_ylabel('Samples')
# Plot the violin plot in the second subplot
sns.violinplot(x='Expression Level', y='Samples', data=data_melted, ax=axs[1], inner='quart',
# xticklabels=[f'{q:.1f}-{u:.1f}' for q, u in zip(lower_quantile, upper_quantile)], # Use quantiles as x-axis labels
palette=sample_color_dict
)
axs[1].set_xscale('log')
axs[1].set_title('Violin Plot normal data')
axs[1].set_xlabel('Quantile Log Expression Level')
axs[1].set_ylabel('Samples')
axs[1].set_xticklabels([f'{q:.1f}-{u:.1f}' for q, u in zip(lower_quantile, upper_quantile)])
# Define the nor Histogram graph file path for graph storing
Pre_Box_Violin_Plot_path = os.path.join(graphs_files_original, f"{directory}_Pre_Box_Violin_Plot.svg")
plt.savefig(Pre_Box_Violin_Plot_path, format='svg', bbox_inches='tight', dpi=300)
print(f"{directory}_Box_Violin_Plot.svg saved to {Pre_Box_Violin_Plot_path}")
# Show the plot
plt.tight_layout()
plt.close()
# plt.show()
/tmp/ipykernel_1548459/199245737.py:73: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator.
axs[1].set_xticklabels([f'{q:.1f}-{u:.1f}' for q, u in zip(lower_quantile, upper_quantile)])
RQ023682_Box_Violin_Plot.svg saved to /home/harryjo/rnaseq_analysis/RQ023682/RQ023682_graphs/Original_Graph/RQ023682_Pre_Box_Violin_Plot.svg
Quantile Normalization (Limma) ¶
InĀ [42]:
# Removing first 5 columns
columns_to_include = [column for column in nor_clean_compile_df.columns if any(any(sample in column for sample in samples) for samples in sample_key.values())]
nor_clean_quant = nor_clean_compile_df[columns_to_include].astype(float)
# Convert pandas DataFrame to R data frame
nor_clean_r = df_to_r_dataframe(nor_clean_quant)
# Perform quantile normalization
nor_clean_quant_r = robjects.r['normalizeQuantiles'](nor_clean_r , ties=True)
# Change R DataFrame to pandas DataFrame
nor_clean_quant_df = robjects.conversion.rpy2py(nor_clean_quant_r)
# Saving the merged DataFrame
Qunatile_Path = os.path.join(database_files_original, f"{directory}_quatile_norm.xlsx")
Quantile_df = save_dataframe(fasta_data, nor_clean_quant_df, Qunatile_Path)
DataFrame saved to /home/harryjo/rnaseq_analysis/RQ023682/RQ023682_db/Original_db/RQ023682_quatile_norm.xlsx
Box & Violin Plot (Post-Quantile Normalization) ¶
InĀ [43]:
# Removing nan values
Quantile_df.replace(['nan'], np.nan, inplace=True)
# Select the columns containing the sample data
columns_to_include = [column for column in nor_clean_compile_df.columns if any(any(sample in column for sample in samples) for samples in sample_key.values())]
sample_data = Quantile_df[columns_to_include]
sample_data = sample_data.dropna(how='all').astype(float)
# Get the gene symbols corresponding to the data points
gene_symbols = Quantile_df.loc[sample_data.index, 'Gene_Symbol']
# Combine the sample data and gene symbols into a single DataFrame
data_melted = pd.concat([sample_data, gene_symbols], axis=1)
# Extract the sample names from the column headers
sample_names = sample_data.columns.str.split('_', expand=True).get_level_values(0)
# Melt the data for plotting
data_melted = data_melted.melt(id_vars=['Gene_Symbol'], value_vars=sample_names, var_name='Samples', value_name='Expression Level')
# Calculate the 5th and 95th percentile of the expression level
lower_quantile = data_melted.groupby('Samples')['Expression Level'].quantile(0.05)
upper_quantile = data_melted.groupby('Samples')['Expression Level'].quantile(0.95)
# Calculate the number of unique samples
num_unique_samples = len(data_melted['Samples'].unique())
# Define the sample names
include_baselline = name_list.copy()
new_value = "Baseline"
include_baselline.append(new_value)
col_sample = [column for column in nor_clean_compile_df.columns if any(sample in column for sample in include_baselline)]
# Create a dictionary mapping samples to colors
sample_color_dict = {}
overlap_samples = [sample for sample in col_sample if sample.split("-")[1] in drug_color_map]
# Assign colors from drug_color_map to overlapping samples
for sample in overlap_samples:
sample_color_dict[sample] = drug_color_map[sample.split("-")[1]]
# Generate a color palette with the desired number of colors
num_colors = len(col_sample) - len(overlap_samples)
color_palette = sns.color_palette("Set1", n_colors=num_colors) # Or any other color palette you prefer
# Assign colors to the remaining samples in col_sample
col_colors = dict(zip(col_sample, color_palette))
sample_color_dict.update(col_colors)
# Calculate the figsize dynamically based on the number of unique samples
fig_width = min(12, 1.5 + num_unique_samples * 1)
fig_height = min(12, 1.5 + num_unique_samples * 0.5)
# Create a figure with two subplots
fig, axs = plt.subplots(1, 2, figsize=(fig_width, fig_height))
# Plot the box plot in the first subplot
sns.boxplot(x='Expression Level', y='Samples', data=data_melted, ax=axs[0], palette=sample_color_dict)
axs[0].set_xscale('log')
axs[0].set_title('Whisker Box Plot normal data')
axs[0].set_xlabel('Log Expression Level')
axs[0].set_ylabel('Samples')
# Plot the violin plot in the second subplot
sns.violinplot(x='Expression Level', y='Samples', data=data_melted, ax=axs[1], inner='quart',
# xticklabels=[f'{q:.1f}-{u:.1f}' for q, u in zip(lower_quantile, upper_quantile)], # Use quantiles as x-axis labels
palette=sample_color_dict
)
axs[1].set_xscale('log')
axs[1].set_title('Violin Plot normal data')
axs[1].set_xlabel('Quantile Log Expression Level')
axs[1].set_ylabel('Samples')
axs[1].set_xticklabels([f'{q:.1f}-{u:.1f}' for q, u in zip(lower_quantile, upper_quantile)])
# Define the nor Histogram graph file path for graph storing
Pro_Box_Violin_Plot_path = os.path.join(graphs_files_original, f"{directory}_Pro_Box_Violin_Plot.svg")
plt.savefig(Pro_Box_Violin_Plot_path, format='svg', bbox_inches='tight', dpi=300)
print(f"{directory}_Box_Violin_Plot.html saved to {Pro_Box_Violin_Plot_path}")
# Show the plot
plt.tight_layout()
plt.close()
# plt.show()
/tmp/ipykernel_1548459/3213361500.py:73: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator.
axs[1].set_xticklabels([f'{q:.1f}-{u:.1f}' for q, u in zip(lower_quantile, upper_quantile)])
RQ023682_Box_Violin_Plot.html saved to /home/harryjo/rnaseq_analysis/RQ023682/RQ023682_graphs/Original_Graph/RQ023682_Pro_Box_Violin_Plot.svg
Batch correction (Limma) ¶
InĀ [44]:
# Select the columns containing the sample data
columns_to_include = [column for column in nor_clean_compile_df.columns if any(any(sample in column for sample in samples) for samples in sample_key.values())]
batch_data = Quantile_df[columns_to_include]
batch_data = batch_data.dropna(how='all').astype(float)
# Extract the column names from the pandas DataFrame
col_headers = batch_data.columns.tolist()
# Convert pandas DataFrame to R data frame
nor_clean_quant_batch_r = df_to_r_dataframe(batch_data)
# Initialize variables
batch_assignment = []
current_batch = 0
# Function to get the batch name based on the current batch number
def get_batch_name(batch_number):
if batch_number == 0:
return "BatchBaseline"
else:
return "Batch" + str(batch_number)
# Assign batches based on the header pattern
for name in col_headers:
if "Baseline" in name:
current_batch += 1
batch_assignment.append(get_batch_name(current_batch))
# Convert the batch_assignment list to an R vector
batch = robjects.vectors.StrVector(batch_assignment)
# Check if batch is not empty
if current_batch > 1:
# Perform batch correction
batch_corrected_data = limma.removeBatchEffect(nor_clean_quant_batch_r, batch=batch, refbatch=1)
else:
# Skip batch correction
batch_corrected_data = Quantile_df
# Change R DataFrame to pandas DataFrame
batch_df = pd.DataFrame(batch_corrected_data, columns=col_headers)
batch_df = batch_df.set_index(batch_data.index)
# Saving the merged DataFrame
batch_compile_path = os.path.join(database_files_original, f"{directory}_batch_corrected.xlsx")
batch_compile_df = save_dataframe(fasta_data, batch_df, batch_compile_path)
DataFrame saved to /home/harryjo/rnaseq_analysis/RQ023682/RQ023682_db/Original_db/RQ023682_batch_corrected.xlsx
InĀ [45]:
Quantile_df
Out[45]:
| ORF_ID | NCBI | Group | Gene_Symbol | GC_Content | 1-Baseline-batch1 | 2-DMSO-A1 | 3-DMSO-B1 | 4-DMSO-C1 | 5-Paclitaxel-A | ... | 60-Vinblastine-A | 61-Vinblastine-B | 62-Vinblastine-C | 68-Baseline-batch5 | 69-DMSO-A | 70-DMSO-B | 71-DMSO-C | 72-TAS102-A | 73-TAS102-B | 74-TAS102-C | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0.0 | 1.0 | 805.0 | G06 | CALM2 | 39.111111 | 300.277868 | 210.868951 | 202.486238 | 197.923074 | 170.924696 | ... | 232.544281 | 112.761188 | 217.350953 | 192.534360 | 96.833947 | 116.643235 | 167.641227 | 132.742378 | 115.507831 | 79.060483 |
| 1.0 | 2.0 | 2629.0 | G02 | GBA | 55.245189 | 23.713128 | 23.491208 | 19.233388 | 18.550245 | 38.450861 | ... | 45.954597 | NaN | 12.364550 | 20.552707 | 22.234654 | 24.369918 | 20.680702 | 64.281363 | 33.614174 | 57.161873 |
| 2.0 | 3.0 | 10282.0 | G03 | BET1 | 38.375350 | 179.314316 | 197.271195 | 129.053012 | 204.098197 | 201.406659 | ... | 182.026519 | 251.641328 | 172.638726 | 118.573611 | 118.346581 | 100.196028 | 59.911706 | 139.666778 | 126.810903 | 79.453204 |
| 4.0 | 6.0 | 7178.0 | G02 | TPT1 | 44.123314 | 102.142005 | 221.469150 | 154.558439 | 107.023297 | 85.745020 | ... | 80.962702 | 70.946782 | 110.521912 | 73.799837 | 67.763080 | 90.346526 | 59.579791 | 81.550214 | 45.024986 | 42.189666 |
| 5.0 | 7.0 | 8089.0 | G01 | YEATS4 | 36.111111 | 138.201549 | 177.538561 | 154.558439 | 141.025297 | 219.543618 | ... | 152.368022 | 99.246231 | 65.690778 | 163.842039 | 125.789194 | 132.942906 | 129.226388 | 139.952415 | 70.684885 | 103.838296 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 18383.0 | 100080862.0 | 653427.0 | delta | FOXD4L5 | 61.280000 | 20.325957 | 41.517837 | 27.221601 | 31.855617 | 35.936318 | ... | 10.649355 | NaN | 5.011689 | 14.701741 | 20.622145 | 32.329477 | 44.475515 | 22.947815 | 11.018509 | 13.262017 |
| 18384.0 | 100080864.0 | 389058.0 | delta | SP5 | 57.239627 | 35.858287 | 32.649168 | 31.565979 | 52.378266 | 64.008134 | ... | 32.490999 | NaN | 26.721827 | 33.971673 | 34.418664 | 31.355339 | 28.792104 | 43.922590 | 60.544982 | 39.774269 |
| 18385.0 | 100080865.0 | 642623.0 | delta | UBTFL1 | 63.686636 | 39.788269 | 14.195399 | 2.667069 | 21.536820 | 9.469249 | ... | 43.190103 | NaN | 22.666489 | 85.620338 | 67.466986 | 33.336147 | 63.747687 | 9.206605 | 24.672902 | 33.623282 |
| 18386.0 | 100080869.0 | 100131980.0 | delta | ZNF705G | 65.692308 | 24.680019 | 17.158396 | 37.622366 | 22.549472 | 16.422234 | ... | 38.541220 | NaN | 49.277263 | 32.431882 | 23.757016 | 25.840072 | 8.715556 | 37.857687 | 29.494166 | 30.927423 |
| 18387.0 | 100080871.0 | 7617.0 | delta | ZNF66 | 62.395076 | 1.206136 | NaN | NaN | 0.726026 | 1.297132 | ... | NaN | NaN | NaN | 5.406051 | 3.135368 | 2.494526 | 2.390596 | 1.583552 | 6.637757 | 1.868317 |
14403 rows Ć 73 columns
InĀ [46]:
batch_compile_df
Out[46]:
| ORF_ID | NCBI | Group | Gene_Symbol | GC_Content | 1-Baseline-batch1 | 2-DMSO-A1 | 3-DMSO-B1 | 4-DMSO-C1 | 5-Paclitaxel-A | ... | 60-Vinblastine-A | 61-Vinblastine-B | 62-Vinblastine-C | 68-Baseline-batch5 | 69-DMSO-A | 70-DMSO-B | 71-DMSO-C | 72-TAS102-A | 73-TAS102-B | 74-TAS102-C | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0.0 | 1.0 | 805.0 | G06 | CALM2 | 39.111111 | 272.637273 | 183.228356 | 174.845643 | 170.282479 | 143.284100 | ... | 227.906496 | 108.123403 | 212.713168 | 240.905687 | 145.205273 | 165.014561 | 216.012554 | 181.113705 | 163.879157 | 127.431810 |
| 1.0 | 2.0 | 2629.0 | G02 | GBA | 55.245189 | 28.122769 | 27.900849 | 23.643029 | 22.959886 | 42.860502 | ... | 36.421126 | NaN | 2.831079 | 18.588603 | 20.270550 | 22.405813 | 18.716598 | 62.317259 | 31.650070 | 55.197769 |
| 2.0 | 3.0 | 10282.0 | G03 | BET1 | 38.375350 | 190.436936 | 208.393815 | 140.175633 | 215.220817 | 212.529279 | ... | 144.025149 | 213.639959 | 134.637357 | 155.006645 | 154.779615 | 136.629062 | 96.344740 | 176.099812 | 163.243937 | 115.886238 |
| 4.0 | 6.0 | 7178.0 | G02 | TPT1 | 44.123314 | 87.667599 | 206.994744 | 140.084033 | 92.548891 | 71.270614 | ... | 82.369964 | 72.354044 | 111.929175 | 98.697817 | 92.661060 | 115.244506 | 84.477771 | 106.448194 | 69.922966 | 67.087645 |
| 5.0 | 7.0 | 8089.0 | G01 | YEATS4 | 36.111111 | 133.679273 | 173.016285 | 150.036163 | 136.503021 | 215.021342 | ... | 142.047378 | 88.925588 | 55.370135 | 190.302099 | 152.249255 | 159.402966 | 155.686448 | 166.412475 | 97.144945 | 130.298357 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 18383.0 | 100080862.0 | 653427.0 | delta | FOXD4L5 | 61.280000 | 20.171238 | 41.363118 | 27.066882 | 31.700898 | 35.781599 | ... | 20.938496 | NaN | 15.300830 | 17.683353 | 23.603757 | 35.311088 | 47.457126 | 25.929427 | 14.000121 | 16.243628 |
| 18384.0 | 100080864.0 | 389058.0 | delta | SP5 | 57.239627 | 27.438108 | 24.228989 | 23.145799 | 43.958086 | 55.587954 | ... | 42.244861 | NaN | 36.475690 | 32.309260 | 32.756251 | 29.692926 | 27.129690 | 42.260177 | 58.882569 | 38.111856 |
| 18385.0 | 100080865.0 | 642623.0 | delta | UBTFL1 | 63.686636 | 38.413825 | 12.820956 | 1.292625 | 20.162376 | 8.094805 | ... | 43.746414 | NaN | 23.222800 | 75.820223 | 57.666871 | 23.536032 | 53.947572 | -0.593510 | 14.872787 | 23.823167 |
| 18386.0 | 100080869.0 | 100131980.0 | delta | ZNF705G | 65.692308 | 12.210429 | 4.688806 | 25.152777 | 10.079882 | 3.952644 | ... | 35.649124 | NaN | 46.385168 | 38.381735 | 29.706868 | 31.789924 | 14.665408 | 43.807539 | 35.444019 | 36.877275 |
| 18387.0 | 100080871.0 | 7617.0 | delta | ZNF66 | 62.395076 | 2.765502 | NaN | NaN | 2.285391 | 2.856497 | ... | NaN | NaN | NaN | 5.040778 | 2.770094 | 2.129252 | 2.025323 | 1.218279 | 6.272484 | 1.503043 |
14403 rows Ć 73 columns
Correlation matrix clustering (Post-normalization) ¶
InĀ [47]:
# Stats variable reconfirmed (Overlap between R and Scipy)
stats = importr("stats")
# Select the columns containing the sample data
batch_compile_df_corr = batch_compile_df.fillna(0)
columns_to_include = [column for column in nor_clean_compile_df.columns if any(any(sample in column for sample in samples) for samples in sample_key.values())]
batch_compile_df_corr = batch_compile_df_corr[columns_to_include].astype(float)
# Convert pandas DataFrame to R data frame
with (robjects.default_converter + pandas2ri.converter).context():
batch_compile_df_corr_r = robjects.conversion.get_conversion().py2rpy(batch_compile_df_corr)
# Get the column names from data_raw
col_names = batch_compile_df_corr.columns.values.tolist()
# Calculate the correlation matrix in R
cor_matrix = stats.cor(batch_compile_df_corr_r, use="everything", method ="spearman")
# Convert the correlation matrix to a NumPy array
cor_matrix_np = np.asarray(cor_matrix)
# Compute the condensed distance matrix
dist_matrix = pdist(cor_matrix_np)
# Perform hierarchical clustering with the condensed distance matrix
linkage_matrix = hierarchy.linkage(dist_matrix, method='average')
dendrogram_row = hierarchy.dendrogram(linkage_matrix, no_plot=True)
# Get the order of rows and columns from the dendrogram
order_row = dendrogram_row['leaves']
order_col = dendrogram_row['leaves']
# Reorder the correlation matrix based on the clustering
cor_matrix_ordered = cor_matrix_np[order_row][:, order_col]
# Perform hierarchical clusterisng and plot the heatmap with clustering
sns.set(font_scale=0.7)
g = sns.clustermap(cor_matrix_np, cmap='coolwarm', cbar_pos=(1, 0.2, 0.03, 0.5), cbar_kws={'label': 'Correlation'},
dendrogram_ratio=(0.1, 0.1), linewidths=0.5,
xticklabels=col_names, yticklabels=col_names)
# Rotate the x-axis labels
plt.setp(g.ax_heatmap.get_xticklabels(), rotation=90)
# Set the title above the clustering
g.ax_heatmap.set_title('Heatmap of Correlation Matrix with Clustering', fontsize=16, pad=20, loc='center', y=1.15)
# Define the CPM Histogram graph file path for graph storing
Post_PCC_Heatmap_path = os.path.join(graphs_files_original, f"{directory}_Post_PCC_Heatmap.svg")
plt.savefig(Post_PCC_Heatmap_path, format='svg',bbox_inches='tight', dpi=300)
print(f"{directory}_Post_PCC_Heatmap.svg saved to {PCC_Heatmap_path}")
plt.close()
# plt.show()
RQ023682_Post_PCC_Heatmap.svg saved to /home/harryjo/rnaseq_analysis/RQ023682/RQ023682_graphs/Original_Graph/RQ023682_PCC_Heatmap.svg
Interactive 3D PCA ¶
InĀ [536]:
# Define the sample names
col_sample_Set = [column for column in batch_df.columns if any(sample in column for sample in name_list)]
# Transpose dataframe
exp_table_norm_T = batch_df[col_sample_Set].T
# Customise marker size, line width, edge colour and grid colour
marker_size = 5
marker_linewidth = 0.2
marker_edgecolor = 'black'
color_grid = 'lightgray'
# Create a dictionary mapping samples to colors
sample_color_dict = {}
overlap_samples = [sample for sample in col_sample_Set if sample.split("-")[1] in drug_color_map]
# Assign colors from drug_color_map to overlapping samples
for sample in overlap_samples:
sample_color_dict[sample] = drug_color_map[sample.split("-")[1]]
# Generate a color palette with the desired number of colors
num_colors = len(col_sample_Set) - len(overlap_samples)
color_palette = px.colors.qualitative.Alphabet[:num_colors]
# Assign colors to the remaining samples in col_sample_Set
col_colors = dict(zip(col_sample_Set, color_palette))
sample_color_dict.update(col_colors)
# Assign values to a new variable and replace NaN with zeros
X = exp_table_norm_T.values
X = np.nan_to_num(X)
#Perform PCA with 3 components
pca = PCA(n_components=3)
pca_3d = pca.fit_transform(X)
#Assign data to dataframe with index (sample names)
pca_3d = pd.DataFrame(pca_3d, index=exp_table_norm_T.index)
#Assign the colounms with a color per samples
pca_3d.columns = ['PC%s (%s' % (i + 1, round(pca.explained_variance_ratio_[i] * 100, 2)) + '%)' for i in range(3)]
pca_3d['Colour'] = [sample_color_dict[sample] for sample in pca_3d.index]
#Finding the total variance if necessary
total_var = pca.explained_variance_ratio_.sum() * 100
# Create a figure object
fig = go.Figure()
# Iterate through each sample and add scatter plot trace to the figure
for sample in pca_3d.index:
pca_3d_trace = go.Scatter3d(
x=[pca_3d.loc[sample, pca_3d.columns[0]]], # Use the first component as 'x' values
y=[pca_3d.loc[sample, pca_3d.columns[1]]], # Use the second component as 'y' values
z=[pca_3d.loc[sample, pca_3d.columns[2]]], # Use the third component as 'z' values
mode='markers',
marker=dict(
size=5, # Adjust marker size
line=dict(width=marker_linewidth, color=marker_edgecolor),
color=pca_3d.loc[sample, 'Colour'], # Use the 'Colour' column for coloring
),
name=sample # Use the sample name as the legend label
)
fig.add_trace(pca_3d_trace)
# Update the layout of the 3D scatter plot
fig.update_layout(
width=1000,
height=750,
autosize=False,
margin=dict(l=70, r=20, b=100, t=50, pad=0),
template='plotly_white',
hovermode='closest',
scene=dict(
xaxis=dict(
title=pca_3d.columns[0], # Set x-axis title using the first component column name
gridcolor=color_grid,
zerolinecolor=color_grid,
showbackground=False,
backgroundcolor='rgb(230, 230,230)'
),
yaxis=dict(
title=pca_3d.columns[1], # Set y-axis title using the second component column name
gridcolor=color_grid,
zerolinecolor=color_grid,
showbackground=False,
backgroundcolor='rgb(230, 230, 230)'
),
zaxis=dict(
title=pca_3d.columns[2], # Set z-axis title using the third component column name
gridcolor=color_grid,
zerolinecolor=color_grid,
showbackground=False,
backgroundcolor='rgb(230, 230,230)'
),
aspectratio=dict(x=0.9, y=0.9, z=0.9),
aspectmode='manual'
),
legend=dict(orientation='v', x=1.02, y=0.5, xanchor='left', yanchor='middle', title='Samples', borderwidth=1, bgcolor='rgba(255, 255, 255, 0.9)'),
)
fig.update_traces(marker=dict(size=marker_size, # Adjust marker size
line=dict(width=marker_linewidth, color=marker_edgecolor)),
selector=dict(mode='markers'))
fig.layout.font.family = 'Arial'
fig.layout.font.size = 15
fig.layout.font.color = 'black'
# Define the nor Histogram graph file path for graph storing
PCA_3D_Plot_path = os.path.join(graphs_files_original, f"{directory}_3D_PCA.html")
fig.write_html(PCA_3D_Plot_path)
print(f"{directory}_3D_PCA.html saved to {PCA_3D_Plot_path}")
# Show the figure inline in the notebook
# fig.show()
# Close the figure
plt.close()
RQ023682_3D_PCA.html saved to /home/harryjo/rnaseq_analysis/RQ023682/RQ023682_graphs/Original_Graph/RQ023682_3D_PCA.html
/home/harryjo/anaconda3/envs/pipeline/lib/python3.11/site-packages/_plotly_utils/basevalidators.py:2596: DeprecationWarning: *scattermapbox* is deprecated! Use *scattermap* instead. Learn more at: https://plotly.com/python/mapbox-to-maplibre/
Interactive 2D PCA ¶
InĀ [49]:
# Define the sample names
col_sample_Set = [column for column in batch_df.columns if any(sample in column for sample in name_list)]
#Transpose dataframe
exp_table_norm_T = batch_df[col_sample_Set].T
#Customise marker size, line width, edge colour and grid colour
marker_size = 10
marker_linewidth = 1
marker_edgecolor = 'black'
color_grid = 'lightgray'
# Create a dictionary mapping samples to colors
sample_color_dict = {}
overlap_samples = [sample for sample in col_sample_Set if sample.split("-")[1] in drug_color_map]
# Assign colors from drug_color_map to overlapping samples
for sample in overlap_samples:
sample_color_dict[sample] = drug_color_map[sample.split("-")[1]]
# Generate a color palette with the desired number of colors
num_colors = len(col_sample_Set) - len(overlap_samples)
color_palette = px.colors.qualitative.Alphabet[:num_colors]
# Assign colors to the remaining samples in col_sample_Set
col_colors = dict(zip(col_sample_Set, color_palette))
sample_color_dict.update(col_colors)
# Assign values to a new variable and replace NaN with zeros
X = exp_table_norm_T.values
X = np.nan_to_num(X)
# Perform PCA with 2 components
pca = PCA(n_components=2,
random_state= 10000)
# Transformed data to lower dimension
pca_2d = pca.fit_transform(X)
# Assign data to dataframe with index (sample names)
pca_2d = pd.DataFrame(pca_2d, index=exp_table_norm_T.index)
# Assign the columns with a color per sample
pca_2d.columns = ['PCA%s (%s' % (i + 1, round(pca.explained_variance_ratio_[i] * 100, 2)) + '%)' for i in range(2)]
pca_2d['Colour'] = [sample_color_dict[sample] for sample in pca_2d.index]
color_discrete_map = {sample: pca_2d.loc[sample, 'Colour'] for sample in pca_2d.index}
fig = px.scatter(pca_2d,
x=pca_2d.columns[0],
y=pca_2d.columns[1],
color=pca_2d.index, # Use tsne_df.index for coloring
color_discrete_map=color_discrete_map, # Map index values to colors
hover_name=pca_2d.index)
fig.update_layout(
width=1000,
height=750,
autosize=False,
margin=dict(l=70, r=20, b=100, t=50, pad=0),
template='plotly_white',
hovermode='closest',
xaxis=dict(
showgrid=True,
gridcolor=color_grid,
zerolinecolor=color_grid,
),
yaxis=dict(
showgrid=True,
gridcolor=color_grid,
zerolinecolor=color_grid,
),
legend=dict(
title=dict(text='Samples') # Change the legend title to "Sample"
)
)
fig.update_traces(marker=dict(size=marker_size,
line=dict(width=marker_linewidth,
color=marker_edgecolor)),
selector=dict(mode='markers'))
fig.update_layout(font=dict(family='Arial', size=15, color='black'))
# Define the graph path for storing
PCA_2D_Plot_path = os.path.join(graphs_files_original, f"{directory}_2D_PCA.html")
fig.write_html(PCA_2D_Plot_path)
print(f"{directory}_2D_PCA.html saved to {PCA_2D_Plot_path}")
# Show the figure
# fig.show()
plt.close()
RQ023682_2D_PCA.html saved to /home/harryjo/rnaseq_analysis/RQ023682/RQ023682_graphs/Original_Graph/RQ023682_2D_PCA.html
Interactive 2D T-SEN ¶
InĀ [50]:
# Define the sample names
col_sample_Set = [column for column in batch_df.columns if any(sample in column for sample in name_list)]
# Transpose dataframe
exp_table_norm_T = batch_df[col_sample_Set].T
# Customise marker size, line width, edge colour, and grid colour
marker_size = 10
marker_linewidth = 1
marker_edgecolor = 'black'
color_grid = 'lightgray'
# Create a dictionary mapping samples to colors
sample_color_dict = {}
overlap_samples = [sample for sample in col_sample_Set if sample.split("-")[1] in drug_color_map]
# Assign colors from drug_color_map to overlapping samples
for sample in overlap_samples:
sample_color_dict[sample] = drug_color_map[sample.split("-")[1]]
# Generate a color palette with the desired number of colors
num_colors = len(col_sample_Set) - len(overlap_samples)
color_palette = px.colors.qualitative.Alphabet[:num_colors]
# Assign colors to the remaining samples in col_sample_Set
col_colors = dict(zip(col_sample_Set, color_palette))
sample_color_dict.update(col_colors)
# Assign values to a new variable and replace NaN with zeros
X = exp_table_norm_T.values
X = np.nan_to_num(X)
# Perform t-SNE with 2 components
tsne_data = TSNE(n_components=2, random_state=10000, perplexity=len(col_sample_Set)-1)
tsne = tsne_data.fit_transform(X)
# Create a DataFrame with t-SNE results and sample colors
tsne_df = pd.DataFrame(tsne, index=exp_table_norm_T.index, columns=['TSNE1', 'TSNE2'])
tsne_df['Colour'] = [sample_color_dict[sample] for sample in tsne_df.index]
color_discrete_map = {sample: tsne_df.loc[sample, 'Colour'] for sample in tsne_df.index}
# Create the scatter plot using px.scatter
fig = px.scatter(tsne_df,
x=tsne_df.columns[0],
y=tsne_df.columns[1],
color=tsne_df.index, # Use tsne_df.index for coloring
color_discrete_map=color_discrete_map, # Map index values to colors
hover_name=tsne_df.index)
# Update the layout of the 2D scatter plot
fig.update_layout(
width=1000,
height=750,
autosize=False,
margin=dict(l=70, r=20, b=100, t=50, pad=0),
template='plotly_white',
hovermode='closest',
xaxis_title=tsne_df.columns[0],
yaxis_title=tsne_df.columns[1],
legend=dict(orientation='v', x=1.02, y=0.5, xanchor='left', yanchor='middle', title='Samples', borderwidth=1, bgcolor='rgba(255, 255, 255, 0.9)'),
)
fig.update_traces(marker=dict(size=marker_size,
line=dict(width=marker_linewidth,
color=marker_edgecolor)),
selector=dict(mode='markers'))
# Update the font style, size, and color
fig.update_layout(font=dict(family='Arial', size=15, color='black'))
# Define the graph path for storing
TSEN_2D_Plot_path = os.path.join(graphs_files_original, f"{directory}_2D_TSEN.html")
fig.write_html(TSEN_2D_Plot_path)
print(f"{directory}_2D_TSEN.html saved to {TSEN_2D_Plot_path}")
# Show the figure
# fig.show()
plt.close()
RQ023682_2D_TSEN.html saved to /home/harryjo/rnaseq_analysis/RQ023682/RQ023682_graphs/Original_Graph/RQ023682_2D_TSEN.html
Interactive 2D UMAP ¶
InĀ [51]:
# Define the sample names
col_sample_Set = [column for column in batch_df.columns if any(sample in column for sample in name_list)]
# Transpose dataframe
exp_table_norm_T = batch_df[col_sample_Set].T
#Customise marker size, line width, edge colour and grid colour
marker_size = 15
marker_linewidth = 1
marker_edgecolor = 'black'
color_grid = 'lightgray'
# Create a dictionary mapping samples to colors
sample_color_dict = {}
overlap_samples = [sample for sample in col_sample_Set if sample.split("-")[1] in drug_color_map]
# Assign colors from drug_color_map to overlapping samples
for sample in overlap_samples:
sample_color_dict[sample] = drug_color_map[sample.split("-")[1]]
# Generate a color palette with the desired number of colors
num_colors = len(col_sample_Set) - len(overlap_samples)
color_palette = px.colors.qualitative.Alphabet[:num_colors]
# Assign colors to the remaining samples in col_sample_Set
col_colors = dict(zip(col_sample_Set, color_palette))
sample_color_dict.update(col_colors)
# Assign values to a new variable and replace NaN with zeros
X = exp_table_norm_T.values
X = np.nan_to_num(X)
# Perform UMAP with 2 components instead of 3
umap_data = umap.UMAP(n_components=2,
random_state=77,
n_epochs = 500
)
umap_fits = umap_data.fit_transform(X)
# Assign data to dataframe with index (sample names)
umap_2d_df = pd.DataFrame(umap_fits, index=exp_table_norm_T.index)
# Assign the columns with a color per sample
umap_2d_df.columns = ['UMAP%s' % (i + 1) for i in range(2)]
umap_2d_df['Colour'] = [sample_color_dict[sample] for sample in umap_2d_df.index]
color_discrete_map = {sample: umap_2d_df.loc[sample, 'Colour'] for sample in umap_2d_df.index}
# Assign the colums with colr per category
umap_2d_df['DrugCategory'] = [next((category for category, drugs in drug_category.items() if sample.split('-')[1] in drugs), 'Unknown') for sample in umap_2d_df.index]
umap_2d_df['DrugCategory_color'] = [category_colors[category] if category in category_colors else 'Unknown' for category in umap_2d_df['DrugCategory']]
# Create the scatter plot using px.scatter
fig = px.scatter(umap_2d_df,
x=umap_2d_df.columns[0],
y=umap_2d_df.columns[1],
color=umap_2d_df.index, # Use umap_2d_df.index for coloring
color_discrete_map=color_discrete_map, # Map index values to colors
hover_name=umap_2d_df.index)
# Update the layout of the 2D scatter plot
fig.update_layout(
width=1000,
height=750,
autosize=False,
margin=dict(l=70, r=20, b=100, t=50, pad=0),
template='plotly_white',
hovermode='closest',
xaxis_title=umap_2d_df.columns[0],
yaxis_title=umap_2d_df.columns[1],
legend=dict(orientation='v', x=1.02, y=0.5, xanchor='left', yanchor='middle', title='Samples', borderwidth=1, bgcolor='rgba(255, 255, 255, 0.9)'),
)
fig.update_traces(marker=dict(size=marker_size,
line=dict(width=marker_linewidth,
color=marker_edgecolor)),
selector=dict(mode='markers'))
# Update the font style, size, and color
fig.update_layout(font=dict(family='Arial', size=15, color='black'))
# Define the graph path for storing
UMAP_2D_Interactive_Plot_path = os.path.join(graphs_files_original, f"{directory}_2D_UMAP.html")
fig.write_html(UMAP_2D_Interactive_Plot_path)
print(f"{directory}_2D_UMAP.html saved to {UMAP_2D_Interactive_Plot_path}")
# Show the figure
# fig.show()
plt.close()
/home/harryjo/anaconda3/envs/pipeline/lib/python3.11/site-packages/umap/umap_.py:1952: UserWarning: n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.
RQ023682_2D_UMAP.html saved to /home/harryjo/rnaseq_analysis/RQ023682/RQ023682_graphs/Original_Graph/RQ023682_2D_UMAP.html
InĀ [52]:
# Define the sample names
col_sample_Set = [column for column in batch_df.columns if any(sample in column for sample in name_list)]
# Transpose dataframe
exp_table_norm_T = batch_df[col_sample_Set].T
#Customise marker size, line width, edge colour and grid colour
marker_size = 15
marker_linewidth = 1
marker_edgecolor = 'black'
color_grid = 'lightgray'
# Create a dictionary mapping samples to colors
sample_color_dict = {}
overlap_samples = [sample for sample in col_sample_Set if sample.split("-")[1] in drug_color_map]
# Assign colors from drug_color_map to overlapping samples
for sample in overlap_samples:
sample_color_dict[sample] = drug_color_map[sample.split("-")[1]]
# Generate a color palette with the desired number of colors
num_colors = len(col_sample_Set) - len(overlap_samples)
color_palette = px.colors.qualitative.Alphabet[:num_colors]
# Assign colors to the remaining samples in col_sample_Set
col_colors = dict(zip(col_sample_Set, color_palette))
sample_color_dict.update(col_colors)
# Assign values to a new variable and replace NaN with zeros
X = exp_table_norm_T.values
X = np.nan_to_num(X)
# Perform UMAP with 2 components instead of 3
umap_data = umap.UMAP(n_components=3,
random_state=77,
n_epochs = 500
)
umap_fits = umap_data.fit_transform(X)
# Assign data to dataframe with index (sample names)
umap_2d_df = pd.DataFrame(umap_fits, index=exp_table_norm_T.index)
# Assign the columns with a color per sample
umap_2d_df.columns = ['UMAP%s' % (i + 1) for i in range(3)]
umap_2d_df['Colour'] = [sample_color_dict[sample] for sample in umap_2d_df.index]
color_discrete_map = {sample: umap_2d_df.loc[sample, 'Colour'] for sample in umap_2d_df.index}
# Assign the colums with colr per category
umap_2d_df['DrugCategory'] = [next((category for category, drugs in drug_category.items() if sample.split('-')[1] in drugs), 'Unknown') for sample in umap_2d_df.index]
umap_2d_df['DrugCategory_color'] = [category_colors[category] if category in category_colors else 'Unknown' for category in umap_2d_df['DrugCategory']]
# Create the scatter plot using px.scatter
fig = px.scatter(umap_2d_df,
x=umap_2d_df.columns[0],
y=umap_2d_df.columns[1],
color=umap_2d_df.index, # Use umap_2d_df.index for coloring
color_discrete_map=color_discrete_map, # Map index values to colors
hover_name=umap_2d_df.index)
# Update the layout of the 2D scatter plot
fig.update_layout(
width=1000,
height=750,
autosize=False,
margin=dict(l=70, r=20, b=100, t=50, pad=0),
template='plotly_white',
hovermode='closest',
xaxis_title=umap_2d_df.columns[0],
yaxis_title=umap_2d_df.columns[1],
legend=dict(orientation='v', x=1.02, y=0.5, xanchor='left', yanchor='middle', title='Samples', borderwidth=1, bgcolor='rgba(255, 255, 255, 0.9)'),
)
fig.update_traces(marker=dict(size=marker_size,
line=dict(width=marker_linewidth,
color=marker_edgecolor)),
selector=dict(mode='markers'))
# Update the font style, size, and color
fig.update_layout(font=dict(family='Arial', size=15, color='black'))
# Define the graph path for storing
UMAP_2D_Interactive_Plot_path = os.path.join(graphs_files_original, f"{directory}_2D_UMAP.html")
fig.write_html(UMAP_2D_Interactive_Plot_path)
print(f"{directory}_2D_UMAP.html saved to {UMAP_2D_Interactive_Plot_path}")
# Show the figure
# fig.show()
plt.close()
/home/harryjo/anaconda3/envs/pipeline/lib/python3.11/site-packages/umap/umap_.py:1952: UserWarning: n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.
RQ023682_2D_UMAP.html saved to /home/harryjo/rnaseq_analysis/RQ023682/RQ023682_graphs/Original_Graph/RQ023682_2D_UMAP.html
InĀ [53]:
# Define the sample names
col_sample_Set = [column for column in batch_df.columns if any(sample in column for sample in name_list)]
# Transpose dataframe
exp_table_norm_T = batch_df[col_sample_Set].T
#Customise marker size, line width, edge colour and grid colour
marker_size = 15
marker_linewidth = 1
marker_edgecolor = 'black'
color_grid = 'lightgray'
# Create a dictionary mapping samples to colors
sample_color_dict = {}
overlap_samples = [sample for sample in col_sample_Set if sample.split("-")[1] in drug_color_map]
# Assign colors from drug_color_map to overlapping samples
for sample in overlap_samples:
sample_color_dict[sample] = drug_color_map[sample.split("-")[1]]
# Generate a color palette with the desired number of colors
num_colors = len(col_sample_Set) - len(overlap_samples)
color_palette = px.colors.qualitative.Alphabet[:num_colors]
# Assign colors to the remaining samples in col_sample_Set
col_colors = dict(zip(col_sample_Set, color_palette))
sample_color_dict.update(col_colors)
# Assign values to a new variable and replace NaN with zeros
X = exp_table_norm_T.values
X = np.nan_to_num(X)
# Perform UMAP with 2 components instead of 3
umap_data = umap.UMAP(n_components=3,
random_state=77,
n_epochs = 500
)
umap_fits = umap_data.fit_transform(X)
# Assign data to dataframe with index (sample names)
umap_2d_df = pd.DataFrame(umap_fits, index=exp_table_norm_T.index)
# Assign the columns with a color per sample
umap_2d_df.columns = ['UMAP%s' % (i + 1) for i in range(3)]
umap_2d_df['Colour'] = [sample_color_dict[sample] for sample in umap_2d_df.index]
color_discrete_map = {sample: umap_2d_df.loc[sample, 'Colour'] for sample in umap_2d_df.index}
# Assign the colums with colr per category
umap_2d_df['DrugCategory'] = [next((category for category, drugs in drug_category.items() if sample.split('-')[1] in drugs), 'Unknown') for sample in umap_2d_df.index]
umap_2d_df['DrugCategory_color'] = [category_colors[category] if category in category_colors else 'Unknown' for category in umap_2d_df['DrugCategory']]
# Create the scatter plot using px.scatter
fig = px.scatter(umap_2d_df,
x=umap_2d_df.columns[2],
y=umap_2d_df.columns[1],
color=umap_2d_df.index, # Use umap_2d_df.index for coloring
color_discrete_map=color_discrete_map, # Map index values to colors
hover_name=umap_2d_df.index)
# Update the layout of the 2D scatter plot
fig.update_layout(
width=1000,
height=750,
autosize=False,
margin=dict(l=70, r=20, b=100, t=50, pad=0),
template='plotly_white',
hovermode='closest',
xaxis_title=umap_2d_df.columns[2],
yaxis_title=umap_2d_df.columns[1],
legend=dict(orientation='v', x=1.02, y=0.5, xanchor='left', yanchor='middle', title='Samples', borderwidth=1, bgcolor='rgba(255, 255, 255, 0.9)'),
)
fig.update_traces(marker=dict(size=marker_size,
line=dict(width=marker_linewidth,
color=marker_edgecolor)),
selector=dict(mode='markers'))
# Update the font style, size, and color
fig.update_layout(font=dict(family='Arial', size=15, color='black'))
# Define the graph path for storing
UMAP_2D_Interactive_Plot_path = os.path.join(graphs_files_original, f"{directory}_2D_UMAP.html")
fig.write_html(UMAP_2D_Interactive_Plot_path)
print(f"{directory}_2D_UMAP.html saved to {UMAP_2D_Interactive_Plot_path}")
# Show the figure
# fig.show()
plt.close()
/home/harryjo/anaconda3/envs/pipeline/lib/python3.11/site-packages/umap/umap_.py:1952: UserWarning: n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.
RQ023682_2D_UMAP.html saved to /home/harryjo/rnaseq_analysis/RQ023682/RQ023682_graphs/Original_Graph/RQ023682_2D_UMAP.html
2D UMAP (Two legends) ¶
InĀ [54]:
# Define the sample names
col_sample_Set = [column for column in batch_df.columns if any(sample in column for sample in name_list)]
#Transpose dataframe
exp_table_norm_T = batch_df[col_sample_Set].T
#Customise marker size, line width, edge colour and grid colour
marker_size = 20
marker_linewidth = 5
# Create a dictionary mapping samples to colors
sample_color_dict = {}
overlap_samples = [sample for sample in col_sample_Set if sample.split("-")[1] in drug_color_map]
# Assign colors from drug_color_map to overlapping samples
for sample in overlap_samples:
sample_color_dict[sample] = drug_color_map[sample.split("-")[1]]
# Generate a color palette with the desired number of colors
num_colors = len(col_sample_Set) - len(overlap_samples)
color_palette = px.colors.qualitative.Alphabet[:num_colors]
# Assign colors to the remaining samples in col_sample_Set
col_colors = dict(zip(col_sample_Set, color_palette))
sample_color_dict.update(col_colors)
# Assign values to a new variable and replace NaN with zeros
X = exp_table_norm_T.values
X = np.nan_to_num(X)
# Perform UMAP with 2 components instead of 3
umap_data = umap.UMAP(n_components=2,
random_state=77,
n_epochs = 500
)
umap_fits = umap_data.fit_transform(X)
# Assign data to dataframe with index (sample names)
umap_2d_df = pd.DataFrame(umap_fits, index=exp_table_norm_T.index)
# Assign the columns with a color per sample
umap_2d_df.columns = ['UMAP%s' % (i + 1) for i in range(2)]
umap_2d_df['Colour'] = [sample_color_dict[sample] for sample in umap_2d_df.index]
# Create a dictionary mapping samples to colors
sample_color_dict = {sample.split('-')[1] : umap_2d_df.loc[sample, 'Colour'] for sample in umap_2d_df.index}
# Assign the columns with color per category
umap_2d_df['DrugCategory'] = [next((category for category, drugs in drug_category.items() if sample.split('-')[1] in drugs), 'Unknown') for sample in umap_2d_df.index]
umap_2d_df['DrugCategory_color'] = [category_colors[category] if category in category_colors else 'Unknown' for category in umap_2d_df['DrugCategory']]
# Create a list of drugs to exclude
control_to_exclude = ['DMSO', 'Baseline', 'mCherryPositive&BFPNegative', "mCherryNegative&BFPNegative", 'Serumfree']
umap_2d_df = umap_2d_df[~umap_2d_df.index.str.contains('|'.join(control_to_exclude))]
# Filter out the rows corresponding to the drugs to exclude
drug_to_exclude = ['61-Vinblastine-B', '51-Irinotecan-B', '59-6mercaptopurine-C']
umap_2d_df = umap_2d_df.loc[~umap_2d_df.index.str.strip().isin(drug_to_exclude)]
# Create the scatter plot using plt.scatter
plt.figure(figsize=(20, 16))
scatterplot = plt.scatter(umap_2d_df['UMAP1'],
umap_2d_df['UMAP2'],
c=umap_2d_df['Colour'],
edgecolors=umap_2d_df['DrugCategory_color'],
s=200,
linewidth=3.0
)
# Custom legend for common drugs with their marker colors
common_drugs = [index.split('-')[1] for index in umap_2d_df.index]
common_drugs_unique = list(OrderedDict.fromkeys(common_drugs)) # Convert to a list of unique drug names
common_drug_legend_handles = []
for drug in common_drugs_unique:
drug_color = umap_2d_df.loc[umap_2d_df.index.str.contains(drug), 'Colour'].iloc[0]
legend_label = drug
common_drug_legend_handles.append(Patch(facecolor=drug_color, edgecolor='none', label=legend_label))
# Custom legend for drug categories with their colors
drug_categories = umap_2d_df['DrugCategory'].unique()
drug_categories_color = umap_2d_df['DrugCategory_color'].unique()
drug_category_legend_handles = []
for category, edge_color in zip(drug_categories, drug_categories_color):
drug_category_legend_handles.append(Patch(facecolor='none', edgecolor=edge_color, linewidth = 3, label=category))
# Customize the graph
ax = plt.gca()
ax.set(facecolor = "white")
ax.grid(color='#F5F5F5') # Set gridline color to black
ax.tick_params(axis='both', labelsize=14)
ax.set_xlabel('UMAP1', fontsize=14) # Change font size for x-axis label
ax.set_ylabel('UMAP2', fontsize=14) # Change font size for y-axis label
# Show the custom legends
common_legend = ax.legend(handles=common_drug_legend_handles,
bbox_to_anchor=(1.05, 1),
loc='upper left',
ncol=1,
prop ={'size': 15},
title="Sample Color",
title_fontsize= 16,
facecolor = 'white',
borderaxespad=0.)
drug_category_legend = ax.legend(handles=drug_category_legend_handles,
bbox_to_anchor=(1.05, 0.5),
loc='upper left',
ncol=1,
prop ={'size': 15},
title="Category Color",
title_fontsize= 16,
facecolor = 'white',
borderaxespad=0.)
ax.add_artist(common_legend)
# Saving the figure
UMAP_2D_path = os.path.join(graphs_files_original, f"{directory}_2D_UMAP.svg")
plt.savefig(UMAP_2D_path, format='svg', bbox_inches='tight', dpi=300)
print(f"{directory}_2D_UMAP.svg saved to {UMAP_2D_path}")
# Show the figure
# plt.show()
plt.close()
/home/harryjo/anaconda3/envs/pipeline/lib/python3.11/site-packages/umap/umap_.py:1952: UserWarning: n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.
RQ023682_2D_UMAP.svg saved to /home/harryjo/rnaseq_analysis/RQ023682/RQ023682_graphs/Original_Graph/RQ023682_2D_UMAP.svg
2D UMAP (Single legend) ¶
InĀ [55]:
# Define the sample names
col_sample_Set = [column for column in batch_df.columns if any(sample in column for sample in name_list)]
#Transpose dataframe
exp_table_norm_T = batch_df[col_sample_Set].T
#Customise marker size, line width, edge colour and grid colour
marker_size = 50
marker_linewidth = 5
# Create a dictionary mapping samples to colors
sample_color_dict = {}
overlap_samples = [sample for sample in col_sample_Set if sample.split("-")[1] in drug_color_map]
# Assign colors from drug_color_map to overlapping samples
for sample in overlap_samples:
sample_color_dict[sample] = drug_color_map[sample.split("-")[1]]
# Generate a color palette with the desired number of colors
num_colors = len(col_sample_Set) - len(overlap_samples)
color_palette = px.colors.qualitative.Alphabet[:num_colors]
# Assign colors to the remaining samples in col_sample_Set
col_colors = dict(zip(col_sample_Set, color_palette))
sample_color_dict.update(col_colors)
# Assign values to a new variable and replace NaN with zeros
X = exp_table_norm_T.values
X = np.nan_to_num(X)
# Perform UMAP with 2 components instead of 3
umap_data = umap.UMAP(n_components=3,
random_state = 77,
n_epochs = 500
)
umap_fits = umap_data.fit_transform(X)
# Assign data to dataframe with index (sample names)
umap_2d_df = pd.DataFrame(umap_fits, index=exp_table_norm_T.index)
# Assign the columns with a color per sample
umap_2d_df.columns = ['UMAP%s' % (i + 1) for i in range(3)]
umap_2d_df['Colour'] = [sample_color_dict[sample] for sample in umap_2d_df.index]
# Create a dictionary mapping samples to colors
sample_color_dict = {sample.split('-')[1] : umap_2d_df.loc[sample, 'Colour'] for sample in umap_2d_df.index}
# Assign the columns with color per category
umap_2d_df['DrugCategory'] = [next((category for category, drugs in drug_category.items() if sample.split('-')[1] in drugs), 'Unknown') for sample in umap_2d_df.index]
umap_2d_df['DrugCategory_color'] = [category_colors[category] if category in category_colors else 'Unknown' for category in umap_2d_df['DrugCategory']]
# Create a list of drugs to exclude
control_to_exclude = [
# 'DMSO',
'Baseline',
'mCherryPositive&BFPNegative',
"mCherryNegative&BFPNegative",
'Serumfree'
]
umap_2d_df = umap_2d_df[~umap_2d_df.index.str.contains('|'.join(control_to_exclude))]
# Filter out the rows corresponding to the drugs to exclude
drug_to_exclude = ['61-Vinblastine-B', '51-Irinotecan-B', '59-6mercaptopurine-C']
umap_2d_df = umap_2d_df.loc[~umap_2d_df.index.str.strip().isin(drug_to_exclude)]
# Create the scatter plot using plt.scatter
plt.figure(figsize=(10, 10))
scatterplot = plt.scatter(umap_2d_df['UMAP1'],
umap_2d_df['UMAP2'],
c=umap_2d_df['Colour'],
edgecolors=umap_2d_df['DrugCategory_color'],
s=200,
linewidth=3.0
)
# Custom legend for common drugs with their marker colors
common_drugs = [index.split('-')[1] for index in umap_2d_df.index]
common_drugs_unique = list(OrderedDict.fromkeys(common_drugs)) # Convert to a list of unique drug names
common_drug_legend_handles = []
for drug in common_drugs_unique:
drug_color = umap_2d_df.loc[umap_2d_df.index.str.contains(drug), 'Colour'].iloc[0]
legend_label = drug
common_drug_legend_handles.append(Patch(facecolor=drug_color, edgecolor='none', label=legend_label))
# Custom legend for drug categories with their colors
drug_categories = umap_2d_df['DrugCategory'].unique()
drug_categories_color = umap_2d_df['DrugCategory_color'].unique()
drug_category_legend_handles = []
for category, edge_color in zip(drug_categories, drug_categories_color):
drug_category_legend_handles.append(Patch(facecolor='none', edgecolor=edge_color, linewidth = 3, label=category))
# Calculate the range of UMAP1 and UMAP2
umap1_range = umap_2d_df['UMAP1'].max() - umap_2d_df['UMAP1'].min()
umap2_range = umap_2d_df['UMAP2'].max() - umap_2d_df['UMAP2'].min()
max_range = max(umap1_range, umap2_range)
# Customize the graph
ax = plt.gca()
# Set the axis limits with the same range for both axes
ax.set_xlim(umap_2d_df['UMAP1'].mean() - max_range / 2, umap_2d_df['UMAP1'].mean() + max_range / 2)
ax.set_ylim(umap_2d_df['UMAP2'].mean() - max_range / 2, umap_2d_df['UMAP2'].mean() + max_range / 2)
ax.grid(False)
ax.set(facecolor = "white")
ax.tick_params(axis='both', labelsize=16)
ax.set_xlabel('UMAP1', fontsize=16) # Change font size for x-axis label
ax.set_ylabel('UMAP2', fontsize=16) # Change font size for y-axis label
# Show the axis lines
ax.spines['left'].set_visible(True)
ax.spines['bottom'].set_visible(True)
ax.spines['right'].set_visible(True)
ax.spines['top'].set_visible(True)
# Set the color of the spines to black
ax.spines['left'].set_color('black')
ax.spines['bottom'].set_color('black')
ax.spines['right'].set_color('black')
ax.spines['top'].set_color('black')
# Set the maximum number of ticks to display on the y-axis as integers (no decimals)
ax.yaxis.set_major_locator(ticker.MaxNLocator(integer=True))
# Set the aspect ratio to be equal, making the axis square
plt.axis('equal')
ax.set_aspect('auto')
title_font = {'family': 'DejaVu Sans', 'color': 'black', 'weight': 'bold', 'size': 18}
plt.title("UMAP for 16 Chemotherapeutics and Control", fontdict=title_font)
drug_category_legend = ax.legend(handles=drug_category_legend_handles,
bbox_to_anchor=(0.02, 0.99),
loc='upper left',
ncol=1,
prop ={'size': 10},
title="Category Color",
title_fontsize= 16,
facecolor = 'white',
borderaxespad=0.)
# Saving the figure
UMAP_2D_path = os.path.join(graphs_files_original, f"{directory}_2D_UMAP.svg")
plt.savefig(UMAP_2D_path, format='svg', bbox_inches='tight', dpi=300)
print(f"{directory}_2D_UMAP.svg saved to {UMAP_2D_path}")
# Show the figure
# plt.show()
plt.close()
/home/harryjo/anaconda3/envs/pipeline/lib/python3.11/site-packages/umap/umap_.py:1952: UserWarning: n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.
RQ023682_2D_UMAP.svg saved to /home/harryjo/rnaseq_analysis/RQ023682/RQ023682_graphs/Original_Graph/RQ023682_2D_UMAP.svg
2D UMAP (UMAP 2 & 3)¶
InĀ [56]:
# Define the sample names
col_sample_Set = [column for column in batch_df.columns if any(sample in column for sample in name_list)]
#Transpose dataframe
exp_table_norm_T = batch_df[col_sample_Set].T
#Customise marker size, line width, edge colour and grid colour
marker_size = 50
marker_linewidth = 5
# Create a dictionary mapping samples to colors
sample_color_dict = {}
overlap_samples = [sample for sample in col_sample_Set if sample.split("-")[1] in drug_color_map]
# Assign colors from drug_color_map to overlapping samples
for sample in overlap_samples:
sample_color_dict[sample] = drug_color_map[sample.split("-")[1]]
# Generate a color palette with the desired number of colors
num_colors = len(col_sample_Set) - len(overlap_samples)
color_palette = px.colors.qualitative.Alphabet[:num_colors]
# Assign colors to the remaining samples in col_sample_Set
col_colors = dict(zip(col_sample_Set, color_palette))
sample_color_dict.update(col_colors)
# Assign values to a new variable and replace NaN with zeros
X = exp_table_norm_T.values
X = np.nan_to_num(X)
# Perform UMAP with 2 components instead of 3
umap_data = umap.UMAP(n_components=3,
random_state = 77,
n_epochs = 500
)
umap_fits = umap_data.fit_transform(X)
# Assign data to dataframe with index (sample names)
umap_3d_df = pd.DataFrame(umap_fits, index=exp_table_norm_T.index)
# Assign the columns with a color per sample
umap_3d_df.columns = ['UMAP%s' % (i + 1) for i in range(3)]
umap_3d_df['Colour'] = [sample_color_dict[sample] for sample in umap_3d_df.index]
# Create a dictionary mapping samples to colors
sample_color_dict = {sample.split('-')[1] : umap_3d_df.loc[sample, 'Colour'] for sample in umap_3d_df.index}
# Assign the columns with color per category
umap_3d_df['DrugCategory'] = [next((category for category, drugs in drug_category.items() if sample.split('-')[1] in drugs), 'Unknown') for sample in umap_3d_df.index]
umap_3d_df['DrugCategory_color'] = [category_colors[category] if category in category_colors else 'Unknown' for category in umap_3d_df['DrugCategory']]
# Create a list of drugs to exclude
control_to_exclude = [
# 'DMSO',
'Baseline',
'mCherryPositive&BFPNegative',
"mCherryNegative&BFPNegative",
'Serumfree'
]
umap_3d_df = umap_3d_df[~umap_3d_df.index.str.contains('|'.join(control_to_exclude))]
# Filter out the rows corresponding to the drugs to exclude
drug_to_exclude = ['61-Vinblastine-B', '51-Irinotecan-B', '59-6mercaptopurine-C']
umap_3d_df = umap_3d_df.loc[~umap_3d_df.index.str.strip().isin(drug_to_exclude)]
# Create the scatter plot using plt.scatter, but only showing UMAP2 and UMAP3
plt.figure(figsize=(10, 10))
scatterplot = plt.scatter(umap_3d_df['UMAP3'],
umap_3d_df['UMAP2'],
c=umap_3d_df['Colour'],
edgecolors=umap_3d_df['DrugCategory_color'],
s=200,
linewidth=3.0
)
# Custom legend for common drugs with their marker colors
common_drugs = [index.split('-')[1] for index in umap_3d_df.index]
common_drugs_unique = list(OrderedDict.fromkeys(common_drugs)) # Convert to a list of unique drug names
common_drug_legend_handles = []
for drug in common_drugs_unique:
drug_color = umap_3d_df.loc[umap_3d_df.index.str.contains(drug), 'Colour'].iloc[0]
legend_label = drug
common_drug_legend_handles.append(Patch(facecolor=drug_color, edgecolor='none', label=legend_label))
# Custom legend for drug categories with their colors
drug_categories = umap_3d_df['DrugCategory'].unique()
drug_categories_color = umap_3d_df['DrugCategory_color'].unique()
drug_category_legend_handles = []
for category, edge_color in zip(drug_categories, drug_categories_color):
drug_category_legend_handles.append(Patch(facecolor='none', edgecolor=edge_color, linewidth = 3, label=category))
# Calculate the range of UMAP1 and UMAP2
umap2_range = umap_3d_df['UMAP3'].max() - umap_3d_df['UMAP3'].min()
umap3_range = umap_3d_df['UMAP2'].max() - umap_3d_df['UMAP2'].min()
max_range = max(umap2_range, umap3_range)
# Customize the graph
ax = plt.gca()
# Set the axis limits with the same range for both axes
ax.set_xlim(umap_3d_df['UMAP3'].mean() - max_range / 2, umap_3d_df['UMAP3'].mean() + max_range / 2)
ax.set_ylim(umap_3d_df['UMAP2'].mean() - max_range / 2, umap_3d_df['UMAP2'].mean() + max_range / 2)
ax.grid(False)
ax.set(facecolor = "white")
ax.tick_params(axis='both', labelsize=16)
ax.set_xlabel('UMAP3', fontsize=16) # Change font size for x-axis label
ax.set_ylabel('UMAP2', fontsize=16) # Change font size for y-axis label
# Show the axis lines
ax.spines['left'].set_visible(True)
ax.spines['bottom'].set_visible(True)
ax.spines['right'].set_visible(True)
ax.spines['top'].set_visible(True)
# Set the color of the spines to black
ax.spines['left'].set_color('black')
ax.spines['bottom'].set_color('black')
ax.spines['right'].set_color('black')
ax.spines['top'].set_color('black')
# Set the maximum number of ticks to display on the y-axis as integers (no decimals)
ax.yaxis.set_major_locator(ticker.MaxNLocator(integer=True))
# Set the aspect ratio to be equal, making the axis square
plt.axis('equal')
ax.set_aspect('auto')
title_font = {'family': 'DejaVu Sans', 'color': 'black', 'weight': 'bold', 'size': 18}
plt.title("UMAP for 16 Chemotherapeutics and Control", fontdict=title_font)
drug_category_legend = ax.legend(handles=drug_category_legend_handles,
bbox_to_anchor=(0.02, 0.99),
loc='upper left',
ncol=1,
prop ={'size': 10},
title="Category Color",
title_fontsize= 16,
facecolor = 'white',
borderaxespad=0.)
UMAP_2D_2_3_path = os.path.join(graphs_files_original, f"{directory}_2D_UMAP_2_3.svg")
plt.savefig(UMAP_2D_2_3_path, format='svg', bbox_inches='tight', dpi=300)
print(f"{directory}_2D_UMAP_2_3.svg saved to {UMAP_2D_2_3_path}")
plt.close()
RQ023682_2D_UMAP_2_3.svg saved to /home/harryjo/rnaseq_analysis/RQ023682/RQ023682_graphs/Original_Graph/RQ023682_2D_UMAP_2_3.svg
/home/harryjo/anaconda3/envs/pipeline/lib/python3.11/site-packages/umap/umap_.py:1952: UserWarning: n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.
Interactive 3D UMAP¶
InĀ [57]:
# Define the sample names
col_sample_Set = [column for column in batch_df.columns if any(sample in column for sample in name_list)]
#Transpose dataframe
exp_table_norm_T = batch_df[col_sample_Set].T
# Customise marker size, line width, edge colour and grid colour
marker_size = 10
marker_linewidth = 0.2
marker_edgecolor = 'black'
color_grid = 'lightgray'
# Create a dictionary mapping samples to colors
sample_color_dict = {}
overlap_samples = [sample for sample in col_sample_Set if sample.split("-")[1] in drug_color_map]
# Assign colors from drug_color_map to overlapping samples
for sample in overlap_samples:
sample_color_dict[sample] = drug_color_map[sample.split("-")[1]]
# Generate a color palette with the desired number of colors
num_colors = len(col_sample_Set) - len(overlap_samples)
color_palette = px.colors.qualitative.Alphabet[:num_colors]
# Assign colors to the remaining samples in col_sample_Set
col_colors = dict(zip(col_sample_Set, color_palette))
sample_color_dict.update(col_colors)
# Assign values to a new variable and replace NaN with zeros
X = exp_table_norm_T.values
X = np.nan_to_num(X)
# Perform UMAP with 2 components instead of 3
umap_data = umap.UMAP(n_components=3,
random_state = 77,
n_epochs = 500
)
umap_fits = umap_data.fit_transform(X)
# Assign data to dataframe with index (sample names)
umap_3d_df = pd.DataFrame(umap_fits, index=exp_table_norm_T.index)
# Assign the columns with a color per sample
umap_3d_df.columns = ['UMAP%s' % (i + 1) for i in range(3)]
umap_3d_df['Colour'] = [sample_color_dict[sample] for sample in umap_3d_df.index]
# Create a dictionary mapping samples to colors
sample_color_dict = {sample.split('-')[1] : umap_3d_df.loc[sample, 'Colour'] for sample in umap_3d_df.index}
# Assign the columns with color per category
umap_3d_df['DrugCategory'] = [next((category for category, drugs in drug_category.items() if sample.split('-')[1] in drugs), 'Unknown') for sample in umap_3d_df.index]
umap_3d_df['DrugCategory_color'] = [category_colors[category] if category in category_colors else 'Unknown' for category in umap_3d_df['DrugCategory']]
# Create a list of drugs to exclude
control_to_exclude = [
# 'DMSO',
'Baseline',
'mCherryPositive&BFPNegative',
"mCherryNegative&BFPNegative",
'Serumfree'
]
umap_3d_df = umap_3d_df[~umap_3d_df.index.str.contains('|'.join(control_to_exclude))]
# Filter out the rows corresponding to the drugs to exclude
drug_to_exclude = ['61-Vinblastine-B', '51-Irinotecan-B', '59-6mercaptopurine-C']
umap_3d_df = umap_3d_df.loc[~umap_3d_df.index.str.strip().isin(drug_to_exclude)]
# Create a figure object
fig = go.Figure()
# Iterate through each sample and add scatter plot trace to the figure
for sample in umap_3d_df.index:
umap_3d_trace = go.Scatter3d(
x=[umap_3d_df.loc[sample, umap_3d_df.columns[0]]], # Use the first component as 'x' values
y=[umap_3d_df.loc[sample, umap_3d_df.columns[1]]], # Use the second component as 'y' values
z=[umap_3d_df.loc[sample, umap_3d_df.columns[2]]], # Use the third component as 'z' values
mode='markers',
marker=dict(
size=5, # Adjust marker size
line=dict(width=marker_linewidth, color=marker_edgecolor),
color=pca_3d.loc[sample, 'Colour'], # Use the 'Colour' column for coloring
),
name=sample # Use the sample name as the legend label
)
fig.add_trace(umap_3d_trace)
# Update the layout of the 3D scatter plot
fig.update_layout(
width=1000,
height=750,
autosize=False,
margin=dict(l=70, r=20, b=100, t=50, pad=0),
template='plotly_white',
hovermode='closest',
scene=dict(
xaxis=dict(
title=umap_3d_df.columns[0], # Set x-axis title using the first component column name
gridcolor=color_grid,
zerolinecolor=color_grid,
showbackground=False,
backgroundcolor='rgb(230, 230,230)'
),
yaxis=dict(
title=umap_3d_df.columns[1], # Set y-axis title using the second component column name
gridcolor=color_grid,
zerolinecolor=color_grid,
showbackground=False,
backgroundcolor='rgb(230, 230, 230)'
),
zaxis=dict(
title=umap_3d_df.columns[2], # Set z-axis title using the third component column name
gridcolor=color_grid,
zerolinecolor=color_grid,
showbackground=False,
backgroundcolor='rgb(230, 230,230)'
),
aspectratio=dict(x=0.9, y=0.9, z=0.9),
aspectmode='manual'
),
legend=dict(orientation='v', x=1.02, y=0.5, xanchor='left', yanchor='middle', title='Samples', borderwidth=1, bgcolor='rgba(255, 255, 255, 0.9)'),
)
fig.update_traces(marker=dict(size=marker_size, # Adjust marker size
line=dict(width=marker_linewidth, color=marker_edgecolor)),
selector=dict(mode='markers'))
fig.layout.font.family = 'Arial'
fig.layout.font.size = 15
fig.layout.font.color = 'black'
# Define the nor Histogram graph file path for graph storing
UMAP_3D_Plot_path = os.path.join(graphs_files_original, f"{directory}_3D_UMAP.html")
fig.write_html(UMAP_3D_Plot_path)
print(f"{directory}_3D_UMAP.html saved to {UMAP_3D_Plot_path}")
# Showing the figure
# fig.show()
plt.close()
/home/harryjo/anaconda3/envs/pipeline/lib/python3.11/site-packages/umap/umap_.py:1952: UserWarning: n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.
RQ023682_3D_UMAP.html saved to /home/harryjo/rnaseq_analysis/RQ023682/RQ023682_graphs/Original_Graph/RQ023682_3D_UMAP.html
HDBSCAN ¶
InĀ [58]:
# Importing UMAP
from umap import UMAP
# Make copy of upmap_2d_df
HDBSCAN_df = umap_2d_df.copy()
# Extract the 'DrugCategory' column for clustering
drug_categories = HDBSCAN_df[['DrugCategory']]
# Initialize and fit the KModes clusterer
n_clusters = 4 # You can adjust the number of clusters as needed
km = KModes(n_clusters=n_clusters, init='Huang', n_init=5, verbose=1)
clusters = km.fit_predict(drug_categories)
# Add the cluster labels to your DataFrame
HDBSCAN_df['Cluster_DrugCategory'] = clusters
Init: initializing centroids Init: initializing clusters Starting iterations... Run 1, iteration: 1/100, moves: 0, cost: 5.0 Init: initializing centroids Init: initializing clusters Starting iterations... Run 2, iteration: 1/100, moves: 0, cost: 15.0 Init: initializing centroids Init: initializing clusters Starting iterations... Run 3, iteration: 1/100, moves: 0, cost: 5.0 Init: initializing centroids Init: initializing clusters Starting iterations... Run 4, iteration: 1/100, moves: 0, cost: 12.0 Init: initializing centroids Init: initializing clusters Starting iterations... Run 5, iteration: 1/100, moves: 0, cost: 5.0 Best run was number 1
InĀ [59]:
# Define legend labels for each cluster
legend_labels = [f'Cluster {i}' for i in range(len(HDBSCAN_df['Cluster_DrugCategory'].unique()))]
# Create a scatter plot
plt.figure(figsize=(10, 6))
scatter = plt.scatter(HDBSCAN_df['UMAP1'], HDBSCAN_df['UMAP2'], c=HDBSCAN_df['Cluster_DrugCategory'], cmap='viridis')
plt.title('UMAP Projection of Clusters by Drug Category')
# Create a distance matrix
umap_result = HDBSCAN_df[['UMAP1', 'UMAP2']].values
distance_matrix = distance_matrix(umap_result, umap_result)
# Build the minimum spanning tree
mst = minimum_spanning_tree(coo_matrix(distance_matrix))
# Plot the spanning tree
edges = mst.toarray()
edge_coordinates = np.argwhere(edges)
# Plot the spanning tree
plt.plot(umap_result[edge_coordinates[:, 0], 0], umap_result[edge_coordinates[:, 0], 1], 'k-', alpha=0.2)
plt.legend(handles=scatter.legend_elements()[0], labels=legend_labels, title="Clusters")
plt.show()
InĀ [60]:
# Create a distance matrix
umap_result = HDBSCAN_df[['UMAP1', 'UMAP2']]
# Build the minimum spanning tree using HDBSCAN
clusterer = hdbscan.HDBSCAN(algorithm='best', min_cluster_size=2, gen_min_span_tree=True,
approx_min_span_tree=True, leaf_size=40,
metric='euclidean', min_samples=None, p=None)
clusterer.fit(umap_result)
# # Plot the minimum spanning tree
# clusterer.minimum_spanning_tree_.plot(
# edge_cmap='viridis',
# edge_alpha=0.6,
# node_size=10,
# edge_linewidth=1
# )
clusterer.single_linkage_tree_.plot(cmap='viridis', colorbar=True)
# clusterer.condensed_tree_.plot()
plt.title('Minimum Spanning Tree in UMAP Space')
plt.show()
InĀ [61]:
# Group the data by 'DrugCategory' and calculate the median points for UMAP1 and UMAP2
category_medians = HDBSCAN_df.groupby('DrugCategory')[['UMAP1', 'UMAP2']].median()
# Perform hierarchical clustering based on the category medians using complete linkage
Z = linkage(category_medians, method='complete')
# Plot the dendrogram with custom labels and labeled Y-axis
plt.figure(figsize=(12, 6))
dendrogram(Z, orientation='top', labels=category_medians.index)
plt.title('Hierarchical Clustering Dendrogram of Drug Categories based on Medians')
plt.xticks(rotation=90)
# Label the Y-axis
plt.ylabel('Linkage Distance')
plt.show()
Scatter Plot ¶
InĀ [62]:
# Assign handle - only when its necessary
scatter_plot_handle = ""
if scatter_plot_handle == "Yes":
# Assign here because it overlaps with R function of stats
from scipy import stats
# Define variables for marker size, color, transparency, and font sizes
marker_size = 0.3
marker_color = 'royalblue'
marker_alpha = 0.6
corr_fontsize = 8
axis_fontsize = 10
# Modify the default parameters of matplotlib for customization
plt.rcParams["axes.labelsize"] = axis_fontsize
plt.rcParams["axes.facecolor"] = 'white'
plt.rcParams['figure.facecolor'] = 'white'
plt.rcParams["svg.fonttype"] = 'none'
# Define a function to create a scatterplot matrix
def scatter_matrix_lower(df):
# Calculate Pearson correlation coefficient between two variables
def corrfunc(x, y, **kwargs):
# using the 'pearsonr' function from 'stats' module in scipy
r, _ = stats.pearsonr(x, y)
# Add an annotation with the correlation coefficient value to the plot
ax = plt.gca()
ax.annotate("$p$ = {:.2f}".format(r),
xy=(.3, .9), xycoords=ax.transAxes, fontsize=corr_fontsize)
#Enable gridlines
plt.grid(True)
# Create a PairGrid object from seaborn to visualize pairwise relationships
grid = sns.PairGrid(data=df, vars=list(df), height=2) # Increase the height parameter as per your preference
# Plot scatterplots in the lower triangle of the PairGri
grid.map_lower(plt.scatter, s=marker_size, color=marker_color, alpha=marker_alpha)
# Add correlation coefficient annotations to the scatterplots
grid.map_lower(corrfunc)
# Modify properties of the PairGrid, such as transparency and tick marks
grid.set(alpha=1)
grid.set(xticks=[])
grid.set(yticks=[])
# Set an empty title for the figure
grid.fig.suptitle('')
# Call the scatter_matrix_lower function with a subset of the DataFrame
scatter_matrix_lower(batch_df[col_sample_Set].dropna(how='any', axis=0))
# Adjust the plot layout and display the scatterplot matrix
plt.tight_layout()
# Define the TPM Histogram graph file path for graph storing
Scatter_Plot_path = os.path.join(graphs_files_original, f"{directory}_ScatterPlot_PCC.svg")
plt.savefig(Scatter_Plot_path, format='svg')
print(f"Merged read_summary.svg saved to {Scatter_Plot_path}")
else:
print("Scatter Plot handle is not assigned")
Scatter Plot handle is not assigned
Venn Diagram (Verified Gene Sets) ¶
InĀ [63]:
# Import hORFeome 9.1 datasheet to get verified genes
hORFeome9_1 = pd.read_excel(f"/home/{user_id}/rnaseq_analysis/Reference/hORFeome9.1/20220830_hORFeome 9.1.xlsx")
verified_hORFeome9_1 = hORFeome9_1[hORFeome9_1['Verified'] == 1]
# Create a copy of stat_filtered_df
original_stat_filtered_df = batch_compile_df.copy()
# Create a dictionary mapping entrez_gene_symbol to Verified values from hORFeome9_1
gene_symbol_to_verified = hORFeome9_1.set_index('entrez_gene_symbol')['Verified'].to_dict()
# Add a new "Verified" column to original_stat_filtered_df based on Gene_Symbol using .map
original_stat_filtered_df['Verified'] = batch_compile_df['Gene_Symbol'].map(gene_symbol_to_verified)
# Get the index of the "Gene_Symbol" column
gene_symbol_index = original_stat_filtered_df.columns.get_loc('Gene_Symbol')
# Create a list of columns with the "Verified" column moved to the desired position
new_columns = list(original_stat_filtered_df.columns)
new_columns.insert(gene_symbol_index + 1, 'Verified')
# Reorder the columns in the original_stat_filtered_df DataFrame
original_stat_filtered_df = original_stat_filtered_df[new_columns]
# Drop any duplicate "Verified" columns
original_stat_filtered_df = original_stat_filtered_df.loc[:, ~original_stat_filtered_df.columns.duplicated()]
# Filter the Verified Columns
verified_stat_filtered_df = original_stat_filtered_df[original_stat_filtered_df['Verified'] == 1]
# Convert to gene sets
verified_genes = set(verified_stat_filtered_df['Gene_Symbol'])
Original_genes = set(verified_hORFeome9_1['entrez_gene_symbol'])
# Calculate the intersections
intersection = verified_genes.intersection(Original_genes)
only_verified_genes = verified_genes - Original_genes
only_original_genes = Original_genes - verified_genes
# Create the Venn diagram
fig, ax = plt.subplots(figsize=(12, 8)) # Adjust figure size to provide space for labels
venn = venn2(subsets=(len(only_verified_genes), len(only_original_genes), len(intersection)),
set_labels=('', ''), # Leave set labels blank as we'll add them manually
set_colors=('cyan', 'grey'),
ax=ax)
# Add custom labels with boxes outside the Venn diagram
plt.text(-0.9, 0.0, 'Verified Genes\nfrom analysis', fontsize=12,
bbox=dict(facecolor='cyan', edgecolor='black', boxstyle='round,pad=0.5'))
plt.text(0.65, 0.0, 'Verified Genes\nfrom hORFeome 9.1', fontsize=12,
bbox=dict(facecolor='lightgrey', edgecolor='black', boxstyle='round,pad=0.5'))
# Calculate the percentage of interaction
intersection_percentage = (len(intersection) / len(Original_genes)) * 100
# Calculate the percentage of interaction
intersection_percentage = (len(intersection) / len(Original_genes)) * 100
# Add the percentage text inside the plot with a box around it
plt.text(0.5, 0.6, f"Common Verified Genes: {intersection_percentage:.1f}%",
transform=plt.gca().transAxes,
horizontalalignment='center', verticalalignment='center', fontsize=10,
bbox=dict(facecolor='white', edgecolor='black', boxstyle='round,pad=0.5'))
# Add a title
plt.title("Venn Diagram for Verified Genes", fontsize=14)
# Define the file path for saving the graph
Verified_Genes_path = os.path.join(graphs_files_original, f"{directory}_Venn_Diagram(Verified_Genes).svg")
plt.savefig(Verified_Genes_path, format='svg', bbox_inches='tight', dpi=300)
print(f"Venn Diagram for Verified Genes saved to {Verified_Genes_path}")
# Show the plot
plt.close()
Venn Diagram for Verified Genes saved to /home/harryjo/rnaseq_analysis/RQ023682/RQ023682_graphs/Original_Graph/RQ023682_Venn_Diagram(Verified_Genes).svg
Statistical Analysis ¶
Function Input ¶
InĀ [64]:
def Zscore(X):
# Standardize rows using Z-scores
# Z-Score : A numerical measurement that describes a value's relationship to the mean of the group of values and is mesaured in terms of standard deviations from the mean
# X: numpy ndarray
[m,n] = X.shape # m and n as number of rows and columns
mx = np.nanmean(X,axis=0) # Calculate the sample mean for each column, ignoring NaN values
stdx = np.nanstd(X,axis=0) # Calculate the sample standard deviation for each column, ignoring NaN values
ax = np.divide(X - mx, stdx) # Calculate Z-scores to make sure each column has a standard deviation of 1
return ax
InĀ [65]:
def stat_uneq(x, y):
# Statistics for identifying DEGs - t-stat, rank-stat and median difference
# [t-stat,rank,mdiff] = stat2(x,y)
# Input arguments
# x : treatment data (m x nx)
# y : control data (m x ny)
# All statistical tests are performed by x-y
# Output arguments
# tstat : t-statistic from two sample t-test
# rank : statistic from Wilcoxon's ranksum test
# mdiff : median difference
[m, n] = np.shape(x) # Extract the dimensions of the input arrays 'x' and 'y'
nx = np.sum(~np.isnan(x),axis=1) # Count sum of the non-NaN values in each row of 'x'
ny = np.sum(~np.isnan(y),axis=1) # Count sum of the non-NaN values in each row of 'y'
# Median difference (originally median)
mdiff = np.nanmedian(x,axis=1) - np.nanmedian(y,axis=1) # Calculate the median difference between corresponding columns of 'x' and 'y'
# two-sample t-test (unequal variance of two groups is assumed)
difference = np.nanmean(x,axis=1) - np.nanmean(y,axis=1) # Calculate the mean difference between corresponding columns of 'x' and 'y'
s2x = np.nanvar(x,axis=1) # Calculate the sample variances along each row of 'x'.
s2y = np.nanvar(y,axis=1) # Calculate the sample variances along each row of 'y'.
# Variance: Measure of sample dispersion, it is a measure of how far a set of numbers is spread out from average value
s2xbar = np.divide(s2x, nx) # Calculate the sample variance per element for 'x'.
s2ybar = np.divide(s2y, ny) # Calculate the sample variance per element for 'y'.
# Step is to calculate degree of freedom
dfe = np.divide(np.power(s2xbar + s2ybar, 2), np.divide(np.power(s2xbar,2), (nx-1)) + np.divide(np.power(s2ybar,2), (ny-1)))
# Degree of Freedom: The maximum number of logically indepedent values, which are values that have fredome to vary, in the data sample
se = np.sqrt(s2xbar + s2ybar)
# Standard Error: How far the sample variance deviates from the actual population mean. The larger the sample, the smaller the SE
t = np.divide(difference, se) # Compute t-statistics
# Ranksum Statistic
r = pd.DataFrame(np.concatenate((x, y),axis=1).T).rank(method='average',axis=0).to_numpy()
# Rank the transposed and concatenated data, 'x' and 'y', using the 'average' ranking method and convert it to a NumPy array.
w = np.zeros([m])
# Initialize an array 'w' with zeros.
wvar = np.zeros([m])
# Initialize an array 'wvar' with zeros.
for i in range(m):
tmp_nx = nx[i] # Calling loop variable for nx
tmp_ny = ny[i] # Calling loop variable for ny
wvar[i] = ((tmp_nx * tmp_ny) * (tmp_nx + tmp_ny + 1)) / 12 # Variance calculation of the rank-sum statistic
if tmp_nx <= tmp_ny:
w[i] = np.nansum(r[0:tmp_nx,i])
# If treatment is higher than control Calculate 'w' as the sum of ranked values in the first 'tmp_nx' rows of column 'i' in 'r'.
if np.sum(~np.isnan(r[0:tmp_nx,i])) == 0 :
w[i] = np.nan
wvar[i] = np.nan
# If it is equal to 0, set 'w' and 'wvar' to NaN.
elif tmp_nx > tmp_ny:
w[i] = np.nansum(r[tmp_nx:(tmp_nx + tmp_ny),i])
# If control is higher than treatment, Calculate 'w' as the sum of ranked values in between tmp_nx and tmp_ny rows of column 'i' in 'r'.
if np.sum(~np.isnan(r[tmp_nx:(tmp_nx + tmp_ny),i])) == 0 :
w[i] = np.nan
wvar[i] = np.nan
# If it is equal to 0, set 'w' and 'wvar' to NaN.
min_n =np.min(np.c_[nx,ny], axis=1)
# Calculate the minimum count of non-NaN values between 'nx' and 'ny'.
# Z-Transformation
wmean = np.multiply(min_n, nx + ny + 1) / 2
# Calculates the expected mean of the rank-sum statistic under the null hypothesis.
wc = w - wmean
# Centers the rank-sum statistic by subtracting the expected mean.
wz = np.divide(wc - 0.5 * np.sign(wc), np.sqrt(wvar))
# Standardizes the centered rank-sum statistic using the Z-transformation.
wz[np.isnan(wz)] = 0
wz[np.isinf(wz)] = 0
# set any NaN (Not-a-Number) or infinite values in the Z-transformed rank-sum statistic wz to 0
t[np.isnan(t)] = 0
t[np.isinf(t)] = 0
# set any NaN or infinite values in the t-statistic t to 0
return t, w, mdiff, wz
# t - array of t-statistics one for each row between treatment group (x) and control group (y). A higher absolute value of t indicates a larger difference between the groups
# w - array of rank-sum statistics between treatment group (x) and control group (y). Quantifies difference by ranks of values within group. A higher absolute value w indicates a larger difference
# mdiff - array of median difference. Each element represents the difference in medians between the treatment and control groups for each gene.
# wz - array of Z-transformed rank-sum statistics. Standardize the rank sum to have mean of 0 and standard deviation of 1 under the null hypothesis. This transformation makes it easier to compare the significance of the rank-sum statistics across different genes
InĀ [66]:
def permutation_test(X, treat_num, control_num, Iter_num):
# X: row - genes (a data matrix with genes in rows and samples in columns)
# column - samples
# treat_num (the number of samples in the treatment group)
# control_num (the number of samples in the control group)
# Iter_num (the number of permutations to perform)
X = np.array(X) # Convert genes
[row_num, col_num] = X.shape # Determines the number of rows and columns in the data matrix X and stores these values in row_num and col_num.
try:
if col_num < (treat_num + control_num):
# checks if the total number of samples (col_num) is less than the sum of the treatment and control group sizes (treat_num and control_num).
raise Exception('Total sample number exceeds data matrix column size!')
except Exception as e:
print('InputError: ',e)
# Initialize variables
t = [] # Initializes an array 't' to store t-statistics for each gene and permutation.
w = [] # Initializes an array 'w' to store rank-sum statistics for each gene and permutation.
m = [] # Initializes an array 'm' to store median differences for each gene and permutation.
wz = [] # Initializes an array 'wz' to store Z-transformed rank-sum statistics for each gene and permutation.
# Place to store the results of permuation test.
# Initialize tqdm for progress tracking
with tqdm(total=Iter_num) as pbar:
for i in range(Iter_num):
col_order = np.random.permutation(col_num) # Generates a random permutation of column indices.
xp = X[:,col_order[0:treat_num]] # Selects the first 'treat_num' columns as the treatment group.
yp = X[:,col_order[treat_num:treat_num + control_num]] # Selects the next 'control_num' columns as the control group.
# Calculates various statistics using the 'stat_uneq' function for the shuffled data.
t_i, w_i, m_i, wz_i = stat_uneq(xp, yp)
# Append the statistics for this permutation to the respective lists
t.append(t_i)
w.append(w_i)
m.append(m_i)
wz.append(wz_i)
# Update tqdm progress
pbar.update()
# Convert the lists of statistics to NumPy arrays
t = np.array(t)
w = np.array(w)
m = np.array(m)
wz = np.array(wz)
return t, w, m, wz # Returns arrays containing the results of the permutation test for each gene.
InĀ [67]:
# Define a function realcomp that takes two inputs: x (data) and opt (transformation option).
def realcomp(x,opt):
# Get the dimensions of the input data matrix 'x'.
[n, k] = x.shape
# Check the transformation option 'opt' to determine which transformation to apply.
if opt == 1: # Chi-squared transformation
# Apply the chi-squared transformation to 'x'.
z = -2 * np.log(x)
# Count the non-NaN values in each row of the transformed 'z'.
k = np.sum(~np.isnan(z), axis=1)
# Calculate the sum of the non-NaN values in each row of 'z'.
z = np.nansum(z, axis=1)
# Calculate 'y' values using the chi-squared cumulative distribution function.
y = 1 - chi2.cdf(z, 2 * k)
elif opt == 2: # Logit transformation
# Apply the logit transformation to 'x'.
z = np.log(np.divide(x, 1 - x))
# Count the non-NaN values in each row of the transformed 'z'.
k = np.sum(~np.isnan(z), axis=1)
# Perform additional calculations on 'z'.
z = np.multiply(np.nansum(z, axis=1), np.divide(-np.sqrt(15 * k + 12), np.multiply((5 * k + 2), k * np.power(np.pi, 2))))
# Calculate 'y' values using the t-distribution cumulative distribution function.
y = 1 - t.cdf(z, 5 * k + 4)
elif opt == 3: # Standard Normal (Z-Score) Transformation
# Apply the standard normal transformation to 'x'.
z = -norm.ppf(x)
# Count the non-NaN values in each row of the transformed 'z'.
k = np.sum(~np.isnan(z), axis=1)
# Perform additional calculations on 'z'.
z = np.divide(np.nansum(z, axis=1), np.sqrt(k))
# Calculate 'y' values using the standard normal cumulative distribution function.
y = 1 - norm.cdf(z)
# Return the computed final trasnformed data 'y' values and the intermediate transformed data 'z', depending on the option.
return y, z
InĀ [68]:
def vidz(d, opt, idx):
# Check the number of columns in the input data
if d.shape[1] > 2:
# If more than 2 columns, apply transformation using the specified option
px, _ = np.array(realcomp(d[:,:-1], opt))
else:
# If 2 or fewer columns, take the first column as px
px = d[:,0]
# Plot the transformed data against the last column of the input data
plt.plot(px.reshape(px.size, 1), d[:,-1].reshape(d[:,-1].size, 1), 'g.')
# Mark specific indices on the plot with magenta circles
plt.plot(px[idx].reshape(px[idx].size, 1), d[idx,-1].reshape(d[idx,-1].size, 1), 'mo')
InĀ [69]:
def nwpv(d,opt):
# Set a small value for epsilon
eps = np.finfo(d.dtype).eps
# Get the dimensions of the input data
[n, k] = d.shape
# Iterate over each column in the input data
for i in range(k):
# Replace zeros in the column with small values
tn = np.array(np.where(d[:,i] == 0))
if np.array(tn).size != 0:
d[np.where(d[:,i] == 0),i] = np.linspace(1.e-10, eps, len(tn)).T
# Replace very small values with epsilon
if np.array(np.where(d[:,i] < eps)).size != 0:
d[np.where(d[:,i] < eps),i] = eps
# Replace values close to 1 with a value slightly less than 1
if np.array(np.where(d[:,i] > 0.99999999999999994)).size != 0:
d[np.where(d[:,i] > 0.99999999999999994),i] = 0.99999999999999994
# Apply a transformation to the modified data using the specified option
[y, z] = realcomp(d, opt)
# Find indices where the transformed values are less than 0.05
idx = np.where(y < 0.05)
# Plot the modified data and mark specific indices
vidz(d, opt, idx)
# Return the transformed values and the marked indices
return y, idx
InĀ [70]:
def pval2tail(s0, s):
# p-value computation routine
# s0: null statistic, s: observed statistic
# p-value: computed by two-tail test
# Stack the null statistics s0 into a 1D array
stacked_s0 = np.hstack(s0)
# Compute the empirical cumulative distribution function (ECDF) for the null statistics
ecdf_res = ECDF(stacked_s0)
# Extract unique values (excluding the first one) and their corresponding cumulative probabilities
s0 = ecdf_res.x[1:]
f0 = ecdf_res.y[1:]
# Create a function that interpolates the ECDF values
f = interp1d(s0, f0, bounds_error=False)
# Compute the p-value by evaluating the interpolated function at the observed statistic s
p = f(s)
# Replace any NaN values in p with 0
p[np.isnan(p)] = 0
# Calculate the two-tailed p-value by taking the minimum of p and 1-p and multiplying by 2
p = 2 * np.min(np.c_[p, 1 - p], axis=1)
# Handle special cases when p-values are 0 or 1
if np.sum(p != 0) != 0:
# If there are non-zero p-values, replace zeros with the smallest non-zero value divided by 2
p[p == 0] = np.nanmin(p[p != 0]) / 2
else:
# If all p-values are zero, replace them with a small non-zero value
p[p == 0] = 1e-10
# Adjust p-values that are exactly 1 to ensure they are within [0, 1]
p[p == 1] = (1 - np.max(p[p != 1])) / 2 + np.max(p[p != 1])
return p
InĀ [71]:
def pval_calc(trnd, wrnd, mrnd, wzrnd, x, y, pscale, pcomb):
# Input arguments
# x: treatment data (m by nx)
# y: control data (m by ny)
# pscale: type=bool, if True, scale p-value using Liptak-Stouffer's Z method
# pcomb: p-value combination option
# 1 - pt & pm
# 2 - pwz & pm
# 3 - pt & pwz & pm
# Calculate various statistics for treatment and control data
[tobs, wobs, mobs, wzobs] = stat_uneq(x, y)
# Calculate one-tailed p-values for different statistics
pt = pval2tail(trnd, tobs) # p-value for t-statistic
pw = pval2tail(wrnd, wobs) # p-value for ranksum statistic
pm = pval2tail(mrnd, mobs) # p-value for median difference
pwz = pval2tail(wzrnd, wzobs) # p-value for Z-transformed ranksum statistic
# Combine p-values based on pcomb option
if pcomb == 1:
pall = np.c_[pt, pm] # Combine t-statistic and median difference p-values
elif pcomb == 2:
pall = np.c_[pwz, pm] # Combine Z-transformed ranksum and median difference p-values
elif pcomb == 3:
pall = np.c_[pt, pwz, pm] # Combine t-statistic, Z-transformed ranksum, and median difference p-values
# Scale p-values using Liptak-Stouffer's Z method if pscale is True
if pscale:
pall = 1 - norm.cdf(Zscore(-norm.ppf(pall)))
# Calculate overall p-value using non-parametric weighted p-value combination
ovp = np.array(nwpv(pall, 3)[0])
return pt, pw, pm, pwz, ovp
# pt represents the one-tailed p-values for the t-statistic. It measures significanace difference between treatment vs control groups, based on means.
# pw is the one-tailed p-value associated with the Wilcoxon rank-sum statistic. This statistic assesses the difference in location (median) between the treatment vs control groups.
# pm is the one-tailed p-value for the median difference between the treatment and control groups. It measures the significance of differences in medians.
# pwz represents the one-tailed p-values for the Z-transformed ranksum statistic. The Z-transform standardizes the ranksum statistic, making it easier to compare significance across genes.
Statistical dataframe¶
InĀ [72]:
"""
Unlike original database (Not removing any fault columns - Less than ~ 7,000 genes),
removing all the uncessary columns for stat analysis (except mCherry+/BFP- etc)
"""
# Read the table from XLSX
data_rc = pd.read_excel(f"{merged_xlsx_path}")
# Cleaning out dAAVS1 and pDest from the dataframe
exclude_keywords = ["dAAVS1", "pDest"]
data_nor = data_rc[~data_rc.apply(lambda row: any(keyword in str(row) for keyword in exclude_keywords), axis=1)]
# Drop the columns that have less than half of maximum genes
# exclude_columns = ["63-mCherryPositive&BFPNegative", "64-mCherryNegative&BFPNegative"]
Less_than_half_of_max = gene_counts[gene_counts < (max_gene_count /2) ].index
Less_than_half_of_max = list(Less_than_half_of_max)
Less_columns = Less_than_half_of_max
data_nor = data_nor.drop(Less_columns, axis=1)
# Changing ORF_ID columns to number and sort the dataframe
data_nor["ORF_ID"] = pd.to_numeric(data_nor["ORF_ID"], errors="coerce")
data_nor = data_nor.sort_values("ORF_ID").reset_index(drop=True)
# Select data based on header values
columns_to_include = [column for column in data_nor.columns if any(any(sample in column for sample in samples) for samples in sample_key.values())]
data_col = data_nor[columns_to_include].astype(float)
# Add length for length normalization
gene_info = data_nor[["Gene_Symbol", "Length"]]
gene_merge = gene_info.merge(data_col, left_index=True, right_index=True)
# Set dataframes for gene expression normalziation
data_edgeR = data_col
data_bioinfokit = gene_merge
# norm function associated with scipy.
from bioinfokit.analys import norm
# Define the normalization method you want to use: 'CPM', 'TMM', 'GeTMM' or 'TPM', 'RPKM'
chosen_normalization = 'TMM' # Change this to your desired normalization method
if chosen_normalization == 'CPM':
# Load your data and replace NaN with 0
data_cpm = data_edgeR.fillna(0)
# Convert pandas DataFrame to R data frame
data_raw_r = df_to_r_dataframe(data_cpm)
# Calculate CPM in R
norm_raw = edgeR.cpm(data_raw_r)
norm_log = edgeR.cpm(data_raw_r, log=True)
# Access column names
norm_colnames = list(data_raw_r.colnames)
elif chosen_normalization in ['GeTMM', 'TMM']:
if chosen_normalization == 'GeTMM':
# Load your data and replace NaN with 0 and drop Gene_Symbol
data_norm = data_bioinfokit.fillna(0)
data_norm = data_norm.drop(['Gene_Symbol'], axis=1)
# Assuming 'data_norm' is your DataFrame
data_norm['Length'] = data_norm['Length'] / 10**3
data_norm.iloc[:, 1:] = data_norm.iloc[:, 1:].div(data_norm['Length'], axis=0)
# Select data based on header values
columns_to_include = [col for col in data_bioinfokit.columns if any(any(sample in col for sample in samples) for samples in sample_key.values())]
data_norm = data_norm[columns_to_include]
elif chosen_normalization == 'TMM':
data_norm = data_edgeR.fillna(0)
# Set the GeTMM/TMM handle
getmm_tmm_handle = "Control" # Change this to your desired normalization factors
# Perform normalization and get results
dge_normfactors_df, norm_raw, norm_log, norm_colnames, data_raw_r = edgeR_normfactor(data_norm, getmm_tmm_handle)
elif chosen_normalization in ['RPKM', 'TPM']:
if chosen_normalization == 'RPKM':
nm_method = 'rpkm'
elif chosen_normalization == 'TPM':
nm_method = 'tpm'
data_norm = data_bioinfokit.fillna(0)
# Convert 'Length' column to numeric (if it's not already)
data_norm['Length'] = pd.to_numeric(data_norm['Length'], errors='coerce')
# Make 'Gene_Symbol' column as the index column
data_norm.set_index('Gene_Symbol', inplace=True)
nm = norm()
getattr(nm, nm_method)(df=data_norm, gl='Length')
# Get the normalized DataFrame
nor_df = getattr(nm, f'{nm_method}_norm')
# Reset index back to default integer index
nor_df.reset_index(drop=True, inplace=True)
nor_raw = nor_df
# Calculate the logarithm of nor values (base 2, with a small constant added)
avoid_nan = 0.18050946883 # Mimic edgeR cpm(log=True)
nor_log = (np.log2(nor_raw + avoid_nan)).astype(float)
# Convert nor_raw and nor_log DataFrame to array
norm_raw = nor_raw.values
norm_log = nor_log.values
# Access column names
norm_colnames = nor_raw.columns.tolist()
else:
raise ValueError(f"Invalid normalization method: {chosen_normalization}. Choose 'CPM', 'TMM', 'GeTMM' or 'TPM', 'RPKM'.")
# cpm_raw to dataframe for storage
nor_raw_df = pd.DataFrame(data=np.where(norm_raw != 0, norm_raw, np.nan), columns=norm_colnames)
nor_log_df = pd.DataFrame(data=np.where(norm_log != 0, norm_log, np.nan), columns=norm_colnames)
# Saving the merged DataFrame
nor_raw_path = os.path.join(database_files_stats, f"{directory}_stats_nor_raw.xlsx")
nor_raw_compile_df = save_dataframe(fasta_data, nor_raw_df, nor_raw_path)
nor_log_path = os.path.join(database_files_stats, f"{directory}_stats_nor_log.xlsx")
nor_log_compile_df = save_dataframe(fasta_data, nor_log_df, nor_log_path)
DataFrame saved to /home/harryjo/rnaseq_analysis/RQ023682/RQ023682_db/Statistical_db/RQ023682_stats_nor_raw.xlsx DataFrame saved to /home/harryjo/rnaseq_analysis/RQ023682/RQ023682_db/Statistical_db/RQ023682_stats_nor_log.xlsx
Normalization factors ¶
InĀ [73]:
#Retrieving right data
columns_to_include = [column for column in nor_raw_compile_df.columns if any(any(sample in column for sample in samples) for samples in sample_key.values())]
norm_data = nor_raw_compile_df[columns_to_include]
norm_data = norm_data.dropna(how='all')
norm_data = norm_data.astype(float)
norm_data = np.log2(norm_data)
# Extract the column names from the pandas DataFrame
col_headers = norm_data.columns.tolist()
# Convert pandas DataFrame to R data frame
nor_clean_quant_batch_r = df_to_r_dataframe(norm_data)
# Initialize variables
batch_assignment = []
NGS_assignment = []
control_or_experimental = []
current_batch = 0
# Function to get the batch name based on the current batch number
def get_batch_name(batch_number):
if batch_number == 0:
return "BatchBaseline"
else:
return "Batch" + str(batch_number)
def determine_control_or_experimental(header):
if "Baseline" in header :
return "Baseline"
elif "DMSO" in header :
return "Control"
elif "mCherryPositive&BFPNegative" in header or "mCherryNegative&BFPNegative" in header:
return "plasmid"
else:
return "Experiment"
# Assign batches based on the header pattern
for name in col_headers:
control_or_experimental.append(determine_control_or_experimental(name))
if "Baseline" in name:
current_batch += 1
batch_assignment.append(get_batch_name(current_batch))
if current_batch <= 4:
NGS_assignment.append("NGS1")
elif current_batch == 5:
NGS_assignment.append("NGS2")
# Convert the batch_assignment list to an R vector
batch = robjects.vectors.StrVector(batch_assignment)
NGS = robjects.vectors.StrVector(NGS_assignment)
control_exp = robjects.vectors.StrVector(control_or_experimental)
# R vector to R factor
batch = robjects.r.factor(batch)
NGS = robjects.r.factor(NGS)
control_exp = robjects.r.factor(control_exp)
Noise detection ¶
InĀ [74]:
stats = importr("stats")
# Histogram of log2 to determine proper cutoff
median_log2_nor = base.apply(norm_log, 1, stats.median)
graphics.hist(median_log2_nor)
expr_cutoff = -1.0
graphics.abline(v=expr_cutoff, col="red", lwd=3)
expr_count = base.sum(FloatVector(np.array(median_log2_nor) > expr_cutoff))[0]
# Plot histogram using Python
plt.figure(figsize=(18, 9)) # Adjust the figure size as desired
# Calculate the bin width and adjust the bar width and spacing
num_bins = 50
data_range = np.ptp(median_log2_nor)
bin_width = data_range / num_bins
bar_width = 3.0 * bin_width
bar_spacing = bin_width - bar_width
# Plot the histogram with adjusted bar width and spacing
plt.hist(median_log2_nor, bins=num_bins, range=(np.min(median_log2_nor), np.max(median_log2_nor)),
color='black', edgecolor='black', linewidth=0.5,
rwidth=bar_width, align='mid')
# Set the background color and gridlines
plt.gca().set_facecolor('lightgray')
plt.grid(color='white', linestyle='-', linewidth=0.5)
plt.axvline(x=expr_cutoff, color='red', linewidth=3)
plt.title('Histogram of log2 nor', fontsize=16)
plt.xlabel('Log2 nor', fontsize=16)
plt.ylabel('No of Genes in log2', fontsize=16)
plt.yscale('log')
plt.tick_params(axis='both', labelsize=12)
print("Total number of genes after the cutoff:", int(expr_count))
# Define the nor Histogram graph file path for graph storing
nor_histogram_path = os.path.join(graphs_files_stats, f"{directory}_stats_norm_histogram.svg")
plt.savefig(nor_histogram_path, format='svg', bbox_inches='tight', dpi=300)
print(f"{directory}_stats_norm_histogram.svg saved to {nor_histogram_path}")
# Showing the figure
# plt.close()
plt.show()
Total number of genes after the cutoff: 14506 RQ023682_stats_norm_histogram.svg saved to /home/harryjo/rnaseq_analysis/RQ023682/RQ023682_graphs/Statistical_Graph/RQ023682_stats_norm_histogram.svg
Noise removal from dataframe ¶
InĀ [75]:
# Convert median_log2_nor to a numpy array
median_log2_nor_np = np.array(median_log2_nor)
# Create a boolean mask based on the expression cutoff
mask = median_log2_nor_np > expr_cutoff
unmask = median_log2_nor_np <= expr_cutoff
# Get the indices where the mask is True
indices = np.where(mask)[0]
non_indices = np.where(unmask)[0]
# Subset nor_raw based on the indices
nor_clean = norm_raw[indices, :]
# Subset nor_raw based on not in the indices
nor_unclean = norm_raw[non_indices, :]
# Access column names
if chosen_normalization in ['RPKM', 'TPM']:
nor_colnames = nor_raw.columns.tolist()
elif chosen_normalization in ['CPM', 'TMM', 'GeTMM']:
nor_colnames = list(data_raw_r.colnames)
# nor_raw to dataframe for storage
nor_clean_df = pd.DataFrame(data=np.where(nor_clean != 0, nor_clean, np.nan), index=indices, columns=nor_colnames)
nor_unclean_df = pd.DataFrame(data=np.where(nor_unclean != 0, nor_unclean, np.nan), index=non_indices, columns=nor_colnames)
# Saving the merged DataFrame
nor_clean_path = os.path.join(database_files_stats, f"{directory}_stats_nor_clean.xlsx")
nor_clean_compile_df = save_dataframe(fasta_data, nor_clean_df, nor_clean_path)
# Saving the merged Dataframe
nor_unclean_path = os.path.join(database_files_stats, f"{directory}_stats_nor_unclean.xlsx")
nor_unclean_compile_df = save_dataframe(fasta_data, nor_unclean_df, nor_unclean_path)
DataFrame saved to /home/harryjo/rnaseq_analysis/RQ023682/RQ023682_db/Statistical_db/RQ023682_stats_nor_clean.xlsx DataFrame saved to /home/harryjo/rnaseq_analysis/RQ023682/RQ023682_db/Statistical_db/RQ023682_stats_nor_unclean.xlsx
Correlation matrix clustering (Pre-normalization) ¶
InĀ [76]:
# Get the column names from data_raw
col_names = nor_clean_df.columns.values.tolist()
# Calculate the correlation matrix in R
cor_matrix = stats.cor(nor_clean, use="everything", method ="pearson")
# Convert the correlation matrix to a NumPy array
cor_matrix_np = np.asarray(cor_matrix)
# Compute the condensed distance matrix
dist_matrix = pdist(cor_matrix_np)
# Perform hierarchical clustering with the condensed distance matrix
linkage_matrix = hierarchy.linkage(dist_matrix, method='average')
dendrogram_row = hierarchy.dendrogram(linkage_matrix, no_plot=True)
# Get the order of rows and columns from the dendrogram
order_row = dendrogram_row['leaves']
order_col = dendrogram_row['leaves']
# Reorder the correlation matrix based on the clustering
cor_matrix_ordered = cor_matrix_np[order_row][:, order_col]
# Perform hierarchical clusterisng and plot the heatmap with clustering
sns.set(font_scale=0.7)
g = sns.clustermap(cor_matrix_np, cmap='coolwarm', cbar_pos=(1, 0.2, 0.03, 0.5), cbar_kws={'label': 'Correlation'},
dendrogram_ratio=(0.1, 0.1), linewidths=0.5,
xticklabels=col_names, yticklabels=col_names)
# Rotate the x-axis labels
plt.setp(g.ax_heatmap.get_xticklabels(), rotation=90)
# Set the title above the clustering
g.ax_heatmap.set_title('Heatmap of Correlation Matrix with Clustering', fontsize=16, pad=20, loc='center', y=1.15)
# Define the nor Histogram graph file path for graph storing
PCC_Heatmap_path = os.path.join(graphs_files_stats, f"{directory}_stats_PCC_Heatmap.svg")
plt.savefig(PCC_Heatmap_path, format='svg',bbox_inches='tight', dpi=300)
print(f"{directory}_stats_PCC_Heatmap.svg saved to {PCC_Heatmap_path}")
# Showing the figure
plt.close()
# plt.show()
RQ023682_stats_PCC_Heatmap.svg saved to /home/harryjo/rnaseq_analysis/RQ023682/RQ023682_graphs/Statistical_Graph/RQ023682_stats_PCC_Heatmap.svg
Box & Violin Plot (Pre-normalization) ¶
InĀ [77]:
# Removing nan values
nor_clean_compile_df.replace(['nan'], np.nan, inplace=True)
# Select the columns containing the sample data
columns_to_include = [column for column in nor_clean_compile_df.columns if any(any(sample in column for sample in samples) for samples in sample_key.values())]
sample_data = nor_clean_compile_df[columns_to_include].astype(float)
sample_data = sample_data.dropna(how='all')
sample_data = sample_data.astype(float)
# Get the gene symbols corresponding to the data points
gene_symbols = nor_clean_compile_df.loc[sample_data.index, 'Gene_Symbol']
# Combine the sample data and gene symbols into a single DataFrame
data_melted = pd.concat([sample_data, gene_symbols], axis=1)
# Extract the sample names from the column headers
sample_names = sample_data.columns.str.split('_', expand=True).get_level_values(0)
# Melt the data for plotting
data_melted = data_melted.melt(id_vars=['Gene_Symbol'], value_vars=sample_names, var_name='Samples', value_name='Expression Level')
# Calculate the 10th and 90th percentile of the expression level for each sample
lower_quantile = data_melted.groupby('Samples')['Expression Level'].quantile(0.1)
upper_quantile = data_melted.groupby('Samples')['Expression Level'].quantile(0.9)
# Calculate the number of unique samples
num_unique_samples = len(data_melted['Samples'].unique())
# Define the sample names
include_baselline = name_list.copy()
new_value = ["Baseline", "mCherryPositive&BFPNegative", "mCherryNegative&BFPNegative"]
merged_samples_list = include_baselline + new_value
col_sample = [column for column in nor_clean_compile_df.columns if any(sample in column for sample in merged_samples_list)]
# Create a dictionary mapping samples to colors
sample_color_dict = {}
overlap_samples = [sample for sample in col_sample if sample.split("-")[1] in drug_color_map]
# Assign colors from drug_color_map to overlapping samples
for sample in overlap_samples:
sample_color_dict[sample] = drug_color_map[sample.split("-")[1]]
# Generate a color palette with the desired number of colors
num_colors = len(col_sample) - len(overlap_samples)
color_palette = sns.color_palette("Set1", n_colors=num_colors) # Or any other color palette you prefer
# Assign colors to the remaining samples in col_sample
col_colors = dict(zip(col_sample, color_palette))
sample_color_dict.update(col_colors)
# Calculate the figsize dynamically based on the number of unique samples
fig_width = min(12, 1.5 + num_unique_samples * 1)
fig_height = min(12, 1.5 + num_unique_samples * 0.5)
# Create a figure with two subplots
fig, axs = plt.subplots(1, 2, figsize=(fig_width, fig_height))
# Plot the box plot in the first subplot
sns.boxplot(x='Expression Level', y='Samples', data=data_melted, ax=axs[0], palette=sample_color_dict)
axs[0].set_xscale('log')
axs[0].set_title('Whisker Box Plot normal data')
axs[0].set_xlabel('Log Expression Level')
axs[0].set_ylabel('Samples')
# Plot the violin plot in the second subplot
sns.violinplot(x='Expression Level', y='Samples', data=data_melted, ax=axs[1], inner='quart',
palette=sample_color_dict
)
axs[1].set_xticklabels([f'{q:.1f}-{u:.1f}' for q, u in zip(lower_quantile, upper_quantile)]) # Use quantiles as x-axis labels
axs[1].set_xscale('log')
axs[1].set_title('Violin Plot normal data')
axs[1].set_xlabel('Quantile Log Expression Level')
axs[1].set_ylabel('Samples')
# Define the nor Histogram graph file path for graph storing
Pre_Box_Violin_Plot_path = os.path.join(graphs_files_stats, f"{directory}_stats_Pre_Box_Violin_Plot.svg")
plt.savefig(Pre_Box_Violin_Plot_path, format='svg', bbox_inches='tight', dpi=300)
print(f"{directory}_stats_Pre_Box_Violin_Plot.svg saved to {Pre_Box_Violin_Plot_path}")
# Show the plot
plt.tight_layout()
plt.close()
# plt.show()
/tmp/ipykernel_1548459/1344428164.py:69: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator.
RQ023682_stats_Pre_Box_Violin_Plot.svg saved to /home/harryjo/rnaseq_analysis/RQ023682/RQ023682_graphs/Statistical_Graph/RQ023682_stats_Pre_Box_Violin_Plot.svg
Density Plot (Pre-normalization) ¶
InĀ [78]:
# Preparation of additional dataframe for density plot
columns_to_include = [column for column in nor_clean_compile_df.columns if any(any(sample in column for sample in samples) for samples in sample_key.values())]
Pre_norm_density = nor_clean_compile_df[columns_to_include]
Pre_norm_density = np.log2(Pre_norm_density.dropna(how='all').astype(float))
col_renames = [f'{batch}_{NGS}_{control_exp}' for batch, NGS, control_exp in zip(batch_assignment, NGS_assignment, control_or_experimental)]
Pre_norm_density.columns = col_renames
Pre_norm_renamed = Pre_norm_density
# Create a list of unique batches, NGS, and control/experiment groups
unique_batches = sorted(set(batch_assignment))
unique_NGS = sorted(set(NGS_assignment))
unique_control_exp = sorted(set(control_or_experimental))
# Set up the plot grid
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(10, 6))
# Overlay density plots for batches
for i, batch in enumerate(unique_batches):
batch_columns = [col for col in Pre_norm_renamed.columns if batch in col]
batch_data = Pre_norm_renamed[batch_columns].stack().reset_index(drop=True)
sns.kdeplot(data=batch_data, fill=True, ax=axes[0], label=f'{batch}')
axes[0].set_title('Overlayed Density Plots for Batches')
axes[0].set_xlabel('Gene Expression Level')
axes[0].legend()
# Overlay density plots for NGS
for i, NGS in enumerate(unique_NGS):
NGS_columns = [col for col in Pre_norm_renamed.columns if NGS in col]
NGS_data = Pre_norm_renamed[NGS_columns].stack().reset_index(drop=True)
sns.kdeplot(data=NGS_data, fill=True, ax=axes[1], label=f'{NGS}')
axes[1].set_title('Overlayed Density Plots for NGS')
axes[1].set_xlabel('Gene Expression Level')
axes[1].legend()
# Overlay density plots for control/experiment
for i, control_exp in enumerate(unique_control_exp):
control_exp_columns = [col for col in Pre_norm_renamed.columns if control_exp in col]
control_exp_data = Pre_norm_renamed[control_exp_columns].stack().reset_index(drop=True)
sns.kdeplot(data=control_exp_data, fill=True, ax=axes[2], label=f'{control_exp}')
axes[2].set_title('Overlayed Density Plots for Control/Experiment')
axes[2].set_xlabel('Gene Expression Level')
axes[2].legend()
# Define the Density plot file path for graph storing
Pre_Density_Plot_path = os.path.join(graphs_files_stats, f"{directory}_stats_Pre_density_Plot.svg")
plt.savefig(Pre_Density_Plot_path, format='svg', bbox_inches='tight', dpi=300)
print(f"{directory}_stats_Pre_density_Plot.svg saved to {Pre_Density_Plot_path}")
# Showing the figure
plt.tight_layout()
# plt.show()
plt.close()
RQ023682_stats_Pre_density_Plot.svg saved to /home/harryjo/rnaseq_analysis/RQ023682/RQ023682_graphs/Statistical_Graph/RQ023682_stats_Pre_density_Plot.svg
Winsorization (DescTools) ¶
InĀ [79]:
nor_clean_compile_df
Out[79]:
| ORF_ID | NCBI | Group | Gene_Symbol | GC_Content | 1-Baseline-batch1 | 2-DMSO-A1 | 3-DMSO-B1 | 4-DMSO-C1 | 5-Paclitaxel-A | ... | 62-Vinblastine-C | 63-mCherryPositive&BFPNegative | 64-mCherryNegative&BFPNegative | 68-Baseline-batch5 | 69-DMSO-A | 70-DMSO-B | 71-DMSO-C | 72-TAS102-A | 73-TAS102-B | 74-TAS102-C | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0.0 | 1.0 | 805.0 | G06 | CALM2 | 39.111111 | 191.267044 | 150.099265 | 144.572297 | 140.791295 | 129.090270 | ... | 246.151411 | 184.665331 | 69.709985 | 156.997897 | 80.086365 | 94.963157 | 135.269365 | 115.335034 | 102.790008 | 68.967454 |
| 1.0 | 2.0 | 2629.0 | G02 | GBA | 55.245189 | 21.923890 | 21.167845 | 17.460422 | 17.189635 | 33.467848 | ... | 9.567723 | 22.159840 | 28.916734 | 16.204521 | 18.925308 | 21.168870 | 17.780832 | 56.132640 | 28.393814 | 49.123045 |
| 2.0 | 3.0 | 10282.0 | G03 | BET1 | 38.375350 | 124.739377 | 141.118967 | 97.778365 | 144.884065 | 148.214754 | ... | 192.224247 | 155.939613 | 14.974737 | 96.961476 | 96.242116 | 82.499243 | 49.966388 | 120.597469 | 113.355148 | 69.292772 |
| 4.0 | 6.0 | 7178.0 | G02 | TPT1 | 44.123314 | 78.623607 | 155.872313 | 114.540370 | 84.311066 | 69.667765 | ... | 113.073086 | 115.723607 | 19.622070 | 60.036421 | 57.006721 | 75.377006 | 49.741314 | 71.700677 | 38.518740 | 35.459682 |
| 5.0 | 7.0 | 8089.0 | G01 | YEATS4 | 36.111111 | 100.547498 | 129.572870 | 114.540370 | 106.412025 | 159.143031 | ... | 58.276129 | 120.648016 | 137.870859 | 134.152179 | 101.781231 | 106.042192 | 105.784694 | 120.816738 | 61.629983 | 91.089090 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 18383.0 | 100080862.0 | 653427.0 | delta | FOXD4L5 | 61.280000 | 18.899906 | 35.921192 | 24.444591 | 28.649391 | 31.418796 | ... | 4.348965 | 18.056166 | 32.531326 | 11.688507 | 17.540530 | 27.895427 | 37.587328 | 18.857059 | 8.804283 | 10.735500 |
| 18384.0 | 100080864.0 | 389058.0 | delta | SP5 | 57.239627 | 31.751841 | 28.865243 | 27.936676 | 45.020472 | 53.275349 | ... | 20.875031 | 38.574536 | 36.662288 | 26.830436 | 29.541945 | 27.104068 | 24.983194 | 37.494850 | 52.385486 | 33.507773 |
| 18385.0 | 100080865.0 | 642623.0 | delta | UBTFL1 | 63.686636 | 34.775826 | 12.828997 | 2.793668 | 19.645297 | 8.879225 | ... | 17.395859 | 41.857475 | 17.556589 | 69.865392 | 56.775925 | 28.686787 | 53.342495 | 7.235848 | 20.249852 | 27.652045 |
| 18386.0 | 100080869.0 | 100131980.0 | delta | ZNF705G | 65.692308 | 22.679887 | 15.394796 | 32.825594 | 20.463851 | 15.026381 | ... | 40.880270 | 54.168497 | 6.712813 | 25.502196 | 20.079290 | 22.553750 | 7.652510 | 32.013146 | 24.431886 | 25.374818 |
| 18387.0 | 100080871.0 | 7617.0 | delta | ZNF66 | 62.395076 | 1.511992 | NaN | NaN | 0.818554 | 1.366035 | ... | NaN | 0.820735 | 5.163703 | 4.516014 | 2.769557 | 2.176239 | 2.025664 | 1.096341 | 5.062463 | 1.301273 |
14506 rows Ć 72 columns
InĀ [80]:
# Select the columns containing the sample data
columns_to_include = [column for column in nor_clean_compile_df.columns if any(any(sample in column for sample in samples) for samples in sample_key.values())]
sample_data = nor_clean_compile_df[columns_to_include].astype(float)
#Set the quantile values for winsorization
lower_quantile = 0.00001
upper_quantile = 0.99999
# Turn pandas dataframe to R dataframe
winsorize_r = df_to_r_dataframe(sample_data)
# Run DescTools winsorization
winsorize_df = winsorize_func(winsorize_r, lower_quantile, upper_quantile)
# Save the windorized database
windorize_path = os.path.join(database_files_stats, f"{directory}_windsorize.xlsx")
windorize_compile_df = save_dataframe(fasta_data, winsorize_df, windorize_path)
DataFrame saved to /home/harryjo/rnaseq_analysis/RQ023682/RQ023682_db/Statistical_db/RQ023682_windsorize.xlsx
Upper Quantile Normalization (edgeR) ¶
InĀ [81]:
# Upper quantile normalization function
def upper_quantile_normfactor(data, handle):
experimental_columns = data.columns
if handle == "Control":
# Define your control and experimental keywords
neg_control_keywords = ["DMSO", "Baseline"]
none_keywords = ["mCherryPositive&BFPNegative", "mCherryNegative&BFPNegative"
# , "Serumfree"
]
# Determine group names based on column names
group_names = [
"Control" if any(keyword in col for keyword in neg_control_keywords)
else ("None" if any(keyword in col for keyword in none_keywords)
else "Experimental")
for col in experimental_columns
]
# Determine group names based on column names
group_names = [
"Control" if any(keyword in col for keyword in neg_control_keywords)
else ("Experimental")
for col in experimental_columns
]
elif handle == "Triplet":
# Extract the second part of each column name after splitting by underscore
extracted_keywords = [col.split('-')[1] for col in experimental_columns]
group_names = extracted_keywords
group_factor = robjects.FactorVector(group_names)
data_raw_r = df_to_r_dataframe(data)
dge = edgeR.DGEList(counts=data_raw_r, group=group_factor)
dge = edgeR.calcNormFactors(dge, method="upperquartile", p=0.75)
dge_normfactors_r = dge.rx2('samples')
with (robjects.default_converter + pandas2ri.converter).context():
dge_normfactors_df = robjects.conversion.get_conversion().rpy2py(dge_normfactors_r)
norm_raw = edgeR.cpm(dge)
norm_log = edgeR.cpm(dge, log=True)
norm_colnames = list(data_raw_r.colnames)
return dge_normfactors_df, norm_raw, norm_log, norm_colnames, data_raw_r
# Fill in the NaN to 0
winsorize_data = winsorize_df.fillna(0)
# Set the handle
upperquant_handle = "Control"
# Perform normalization and get results
dge_normfactors_df, norm_raw, norm_log, norm_colnames, data_raw_r = upper_quantile_normfactor(winsorize_data, upperquant_handle)
# Return 0 values to NaN
nor_raw_df = pd.DataFrame(data=np.where(norm_raw != 0, norm_raw, np.nan), columns=norm_colnames)
# Set database to a dedicated index
nor_raw_df = nor_raw_df.set_index(winsorize_df.index)
# Set values to log2
nor_raw_df = np.log2(nor_raw_df)
# Save Upper quantile normalized database
upper_quant_compile_path = os.path.join(database_files_stats, f"{directory}_upper_quant.xlsx")
upper_quant_compile_df = save_dataframe(fasta_data, nor_raw_df, upper_quant_compile_path)
DataFrame saved to /home/harryjo/rnaseq_analysis/RQ023682/RQ023682_db/Statistical_db/RQ023682_upper_quant.xlsx
Batch correction (Limma) ¶
InĀ [82]:
# Preparation of additional dataframe for batch correction
columns_to_include = [column for column in nor_raw_df.columns if any(any(sample in column for sample in samples) for samples in sample_key.values())]
batch_data = nor_raw_df[columns_to_include]
batch_data = batch_data.dropna(how='all').astype(float)
# Convert the batch_assignment list to an R vector
batch = robjects.vectors.StrVector(batch_assignment)
NGS = robjects.vectors.StrVector(NGS_assignment)
control_exp = robjects.vectors.StrVector(control_or_experimental)
# R vector to R factor
batch = robjects.r.factor(batch)
NGS = robjects.r.factor(NGS)
control_exp = robjects.r.factor(control_exp)
# Metadata build up
col_names = [f'{control_exp}_{NGS}' for control_exp, NGS in zip(control_or_experimental, NGS_assignment)]
split_col_names = [name.split('_') for name in col_names]
# Organizing into a DataFrame
metadata_df = pd.DataFrame(split_col_names, columns=['control_exp', 'NGS'], index=col_names)
# Convert pandas DataFrame to R data frame
metadata_r = df_to_r_dataframe(metadata_df)
batch_data_col_r = df_to_r_dataframe(batch_data)
col_names_r = robjects.vectors.StrVector(col_names)
robjects.r.assign('batch_data_col_r', batch_data_col_r)
robjects.r.assign('col_names', col_names_r)
robjects.r('colnames(batch_data_col_r) <- col_names')
# Assign row names to the DataFrame (assuming index is the original names)
robjects.r.assign('metadata_r', metadata_r)
# Create a model matrix using metadata_df in R
if len(set(batch_assignment)) > 1:
model_matrix = robjects.r('model.matrix(~ 0 + control_exp + 0 + NGS, data = metadata_r)')
batch_names = ['Baseline', 'Control', 'Experiment', 'Plasmid', 'NGS']
model_matrix_df = pd.DataFrame(model_matrix, columns=batch_names)
# Convert pandas DataFrame to R data frame
model_matrix_r = df_to_r_dataframe(model_matrix_df)
batch_data_r = df_to_r_dataframe(batch_data)
else:
pass
# Check if batch is not empty
if len(set(batch_assignment)) > 1:
# Perform batch correction
batch_corrected_data = limma.removeBatchEffect(batch_data_r, batch=batch
,design=model_matrix_r
)
else:
# Skip batch correction
batch_corrected_data = upper_quant_compile_df
# Change R DataFrame to pandas DataFrame
batch_stat_df = pd.DataFrame(batch_corrected_data, columns=col_headers)
batch_stat_df = batch_stat_df.set_index(batch_data.index)
batch_stat_df.replace(0, np.nan, inplace=True)
# Saving the merged DataFrame
batch_compile_path = os.path.join(database_files_stats, f"{directory}_batch_corrected.xlsx")
batch_compile_stat_df = save_dataframe(fasta_data, batch_stat_df, batch_compile_path)
#If "coeeficient not estimable:" shows. It means levels are perfectly colinear or have no variability within them.
/home/harryjo/anaconda3/envs/pipeline/lib/python3.11/site-packages/rpy2/robjects/pandas2ri.py:56: UserWarning: DataFrame contains duplicated elements in the index, which will lead to loss of the row names in the resulting data.frame
Coefficients not estimable: batch4 DataFrame saved to /home/harryjo/rnaseq_analysis/RQ023682/RQ023682_db/Statistical_db/RQ023682_batch_corrected.xlsx
Correlation matrix clustering (Post-normalization) ¶
InĀ [83]:
# Stats variable reconfirmed (Overlap between R and Scipy)
stats = importr("stats")
# Select the columns containing the sample data
batch_compile_df_corr = batch_compile_stat_df.fillna(0)
columns_to_include = [column for column in batch_compile_stat_df.columns if any(any(sample in column for sample in samples) for samples in sample_key.values())]
batch_compile_df_corr = batch_compile_df_corr[columns_to_include].astype(float)
# Convert pandas DataFrame to R data frame
batch_compile_df_corr_r = df_to_r_dataframe(batch_compile_df_corr)
# Get the column names from data_raw
col_names = batch_compile_df_corr.columns.values.tolist()
# Calculate the correlation matrix in R
cor_matrix = stats.cor(batch_compile_df_corr_r, use="everything", method ="pearson")
# Convert the correlation matrix to a NumPy array
cor_matrix_np = np.asarray(cor_matrix)
# Compute the condensed distance matrix
dist_matrix = pdist(cor_matrix_np)
# Perform hierarchical clustering with the condensed distance matrix
linkage_matrix = hierarchy.linkage(dist_matrix, method='average')
dendrogram_row = hierarchy.dendrogram(linkage_matrix, no_plot=True)
# Get the order of rows and columns from the dendrogram
order_row = dendrogram_row['leaves']
order_col = dendrogram_row['leaves']
# Reorder the correlation matrix based on the clustering
cor_matrix_ordered = cor_matrix_np[order_row][:, order_col]
# Perform hierarchical clusterisng and plot the heatmap with clustering
sns.set(font_scale=0.7)
g = sns.clustermap(cor_matrix_np, cmap='coolwarm', cbar_pos=(1, 0.2, 0.03, 0.5), cbar_kws={'label': 'Correlation'},
dendrogram_ratio=(0.1, 0.1), linewidths=0.5,
xticklabels=col_names, yticklabels=col_names)
# Rotate the x-axis labels
plt.setp(g.ax_heatmap.get_xticklabels(), rotation=90)
# Set the title above the clustering
g.ax_heatmap.set_title('Heatmap of Correlation Matrix with Clustering', fontsize=16, pad=20, loc='center', y=1.15)
# Define the CPM Histogram graph file path for graph storing
Post_PCC_Heatmap_path = os.path.join(graphs_files_stats, f"{directory}_stats_Post_PCC_Heatmap.svg")
plt.savefig(Post_PCC_Heatmap_path, format='svg',bbox_inches='tight', dpi=300)
print(f"{directory}_stats_Post_PCC_Heatmap.svg saved to {PCC_Heatmap_path}")
# Showing the figure
# plt.close()
plt.show()
RQ023682_stats_Post_PCC_Heatmap.svg saved to /home/harryjo/rnaseq_analysis/RQ023682/RQ023682_graphs/Statistical_Graph/RQ023682_stats_PCC_Heatmap.svg
Box & Violin Plot (Post-normalization) ¶
InĀ [84]:
# Removing nan values
batch_compile_stat_df.replace(['nan'], np.nan, inplace=True)
# Select the columns containing the sample data
columns_to_include = [column for column in batch_compile_stat_df.columns if any(any(sample in column for sample in samples) for samples in sample_key.values())]
sample_data = batch_compile_stat_df[columns_to_include].astype(float)
sample_data = sample_data.dropna(how='all')
sample_data = sample_data.astype(float)
# Get the gene symbols corresponding to the data points
# gene_symbols = nor_clean_compile_df.iloc[sample_data.index, 'Gene_Symbol']
# Combine the sample data and gene symbols into a single DataFrame
data_melted = pd.concat([sample_data, gene_symbols], axis=1)
# Extract the sample names from the column headers
sample_names = sample_data.columns.str.split('_', expand=True).get_level_values(0)
# Melt the data for plotting
data_melted = data_melted.melt(id_vars=['Gene_Symbol'], value_vars=sample_names, var_name='Samples', value_name='Expression Level')
# Calculate the 10th and 90th percentile of the expression level for each sample
lower_quantile = data_melted.groupby('Samples')['Expression Level'].quantile(0.1)
upper_quantile = data_melted.groupby('Samples')['Expression Level'].quantile(0.9)
# Calculate the number of unique samples
num_unique_samples = len(data_melted['Samples'].unique())
# Define the sample names
include_baselline = name_list.copy()
new_value = ["Baseline", "mCherryPositive&BFPNegative", "mCherryNegative&BFPNegative"]
merged_samples_list = include_baselline + new_value
col_sample = [column for column in nor_clean_compile_df.columns if any(sample in column for sample in merged_samples_list)]
# Create a dictionary mapping samples to colors
sample_color_dict = {}
overlap_samples = [sample for sample in col_sample if sample.split("-")[1] in drug_color_map]
# Assign colors from drug_color_map to overlapping samples
for sample in overlap_samples:
sample_color_dict[sample] = drug_color_map[sample.split("-")[1]]
# Generate a color palette with the desired number of colors
num_colors = len(col_sample) - len(overlap_samples)
color_palette = sns.color_palette("Set1", n_colors=num_colors) # Or any other color palette you prefer
# Assign colors to the remaining samples in col_sample
col_colors = dict(zip(col_sample, color_palette))
sample_color_dict.update(col_colors)
# Calculate the figsize dynamically based on the number of unique samples
fig_width = min(12, 1.5 + num_unique_samples * 1)
fig_height = min(12, 1.5 + num_unique_samples * 0.5)
# Create a figure with two subplots
fig, axs = plt.subplots(1, 2, figsize=(fig_width, fig_height))
# Plot the box plot in the first subplot
sns.boxplot(x='Expression Level', y='Samples', data=data_melted, ax=axs[0], palette=sample_color_dict)
# axs[0].set_xscale('log')
axs[0].set_title('Whisker Box Plot normal data')
axs[0].set_xlabel('Log Expression Level')
axs[0].set_ylabel('Samples')
# Plot the violin plot in the second subplot
sns.violinplot(x='Expression Level', y='Samples', data=data_melted, ax=axs[1], inner='quart',
palette=sample_color_dict
)
# axs[1].set_xscale('log')
axs[1].set_xticklabels([f'{q:.1f}-{u:.1f}' for q, u in zip(lower_quantile, upper_quantile)]) # Use quantiles as x-axis labels
axs[1].set_title('Violin Plot normal data')
axs[1].set_xlabel('Quantile Log Expression Level')
axs[1].set_ylabel('Samples')
# Define the nor Histogram graph file path for graph storing
Pre_Box_Violin_Plot_path = os.path.join(graphs_files_stats, f"{directory}_stats_Pro_Box_Violin_Plot.svg")
plt.savefig(Pre_Box_Violin_Plot_path, format='svg', bbox_inches='tight', dpi=300)
print(f"{directory}_stats_Pro_Box_Violin_Plot.svg saved to {Pre_Box_Violin_Plot_path}")
# Show the plot
plt.tight_layout()
plt.close()
# plt.show()
/tmp/ipykernel_1548459/1146168366.py:70: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator.
RQ023682_stats_Pro_Box_Violin_Plot.svg saved to /home/harryjo/rnaseq_analysis/RQ023682/RQ023682_graphs/Statistical_Graph/RQ023682_stats_Pro_Box_Violin_Plot.svg
Density Plot (Post-normalization) ¶
InĀ [85]:
# Preparation of additional dataframe for density plot
columns_to_include = [column for column in batch_compile_stat_df.columns if any(any(sample in column for sample in samples) for samples in sample_key.values())]
batch_data_pro_den = batch_compile_stat_df[columns_to_include]
batch_data_pro_den = batch_data_pro_den.dropna(how='all').astype(float)
col_renames = [f'{batch}_{NGS}_{control_exp}' for batch, NGS, control_exp in zip(batch_assignment, NGS_assignment, control_or_experimental)]
batch_data_pro_den.columns = col_renames
batch_data_renamed = batch_data_pro_den
# Create a list of unique batches, NGS, and control/experiment groups
unique_batches = sorted(set(batch_assignment))
unique_NGS = sorted(set(NGS_assignment))
unique_control_exp = sorted(set(control_or_experimental))
# Set up the plot grid
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(10, 6))
# Overlay density plots for batches
for i, batch in enumerate(unique_batches):
batch_columns = [col for col in batch_data_renamed.columns if batch in col]
batch_data = batch_data_renamed[batch_columns].stack().reset_index(drop=True)
sns.kdeplot(data=batch_data, fill=True, ax=axes[0], label=f'{batch}')
axes[0].set_title('Overlayed Density Plots for Batches')
axes[0].set_xlabel('Gene Expression Level')
axes[0].legend()
# Overlay density plots for NGS
for i, NGS in enumerate(unique_NGS):
NGS_columns = [col for col in batch_data_renamed.columns if NGS in col]
NGS_data = batch_data_renamed[NGS_columns].stack().reset_index(drop=True)
sns.kdeplot(data=NGS_data, fill=True, ax=axes[1], label=f'{NGS}')
axes[1].set_title('Overlayed Density Plots for NGS')
axes[1].set_xlabel('Gene Expression Level')
axes[1].legend()
# Overlay density plots for control/experiment
for i, control_exp in enumerate(unique_control_exp):
control_exp_columns = [col for col in batch_data_renamed.columns if control_exp in col]
control_exp_data = batch_data_renamed[control_exp_columns].stack().reset_index(drop=True)
sns.kdeplot(data=control_exp_data, fill=True, ax=axes[2], label=f'{control_exp}')
axes[2].set_title('Overlayed Density Plots for Control/Experiment')
axes[2].set_xlabel('Gene Expression Level')
axes[2].legend()
# Define the Density plot file path for graph storing
Pro_Density_Plot_path = os.path.join(graphs_files_stats, f"{directory}_stats_Pro_density_Plot.svg")
plt.savefig(Pre_Density_Plot_path, format='svg', bbox_inches='tight', dpi=300)
print(f"{directory}_Pre_density_Plot.svg saved to {Pre_Density_Plot_path}")
# Showing the figure
plt.tight_layout()
# plt.show()
plt.close()
RQ023682_Pre_density_Plot.svg saved to /home/harryjo/rnaseq_analysis/RQ023682/RQ023682_graphs/Statistical_Graph/RQ023682_stats_Pre_density_Plot.svg
Venn Diagram (Verified Gene Sets) ¶
InĀ [86]:
Total_ORFs = pd.to_numeric(data_rc['NCBI'], errors='coerce').dropna().sort_values()
clean_ORFs = pd.to_numeric(batch_compile_stat_df['NCBI'], errors='coerce').dropna()
Total_Sets = set(Total_ORFs)
Clean_Sets = set(clean_ORFs)
intersection = Clean_Sets.intersection(Total_Sets)
only_Total_genes = Total_Sets - intersection
only_Clean_genes = Clean_Sets - Total_Sets
fig, ax = plt.subplots(figsize=(16, 12)) # Adjust figure size to provide space for labels
venn = venn2(subsets=(len(only_Total_genes), len(only_Clean_genes), len(intersection)),
set_labels=('Total ORF', 'Clean ORF'), # <<< --- THIS HIDES THE 'A' AND 'B' LABELS
# set_colors=("#BABDFF", '#CEFFBA'),
ax=ax)
Total_genes_path = os.path.join(graphs_files_stats, f"{directory}_Stats_Venn_Diagram(Total_Genes).svg")
plt.savefig(Total_genes_path , format='svg', bbox_inches='tight', dpi=1000)
print(f"Merged read_summary.svg saved to {Verified_Genes_path}")
Merged read_summary.svg saved to /home/harryjo/rnaseq_analysis/RQ023682/RQ023682_graphs/Original_Graph/RQ023682_Venn_Diagram(Verified_Genes).svg
InĀ [87]:
# Import hORFeome 9.1 datasheet to get verified genes
hORFeome9_1 = pd.read_excel(f"/home/{user_id}/rnaseq_analysis/Reference/hORFeome9.1/20220830_hORFeome 9.1.xlsx")
verified_hORFeome9_1 = hORFeome9_1[hORFeome9_1['Verified'] == 1]
unverified_hORFeome9_1 = hORFeome9_1[hORFeome9_1['Verified'] == 0]
# Create a copy of stat_filtered_df
original_stat_filtered_df = batch_compile_df.copy()
# Create a dictionary mapping entrez_gene_symbol to Verified values from hORFeome9_1
gene_symbol_to_verified = hORFeome9_1.set_index('entrez_gene_symbol')['Verified'].to_dict()
# Add a new "Verified" column to original_stat_filtered_df based on Gene_Symbol using .map
original_stat_filtered_df['Verified'] = batch_compile_df['Gene_Symbol'].map(gene_symbol_to_verified)
# Get the index of the "Gene_Symbol" column
gene_symbol_index = original_stat_filtered_df.columns.get_loc('Gene_Symbol')
# Create a list of columns with the "Verified" column moved to the desired position
new_columns = list(original_stat_filtered_df.columns)
new_columns.insert(gene_symbol_index + 1, 'Verified')
# Reorder the columns in the original_stat_filtered_df DataFrame
original_stat_filtered_df = original_stat_filtered_df[new_columns]
# Drop any duplicate "Verified" columns
original_stat_filtered_df = original_stat_filtered_df.loc[:, ~original_stat_filtered_df.columns.duplicated()]
# Filter the Verified Columns
verified_stat_filtered_df = original_stat_filtered_df[original_stat_filtered_df['Verified'] == 1]
# Convert to gene sets
verified_genes = set(verified_stat_filtered_df['Gene_Symbol'])
Original_genes = set(verified_hORFeome9_1['entrez_gene_symbol'])
# Calculate the intersections
intersection = verified_genes.intersection(Original_genes)
only_verified_genes = verified_genes - Original_genes
only_original_genes = Original_genes - verified_genes
# Create the Venn diagram
fig, ax = plt.subplots(figsize=(16, 12)) # Adjust figure size to provide space for labels
venn = venn2(subsets=(len(only_verified_genes), len(only_original_genes), len(intersection)),
set_labels=('Analyzed Genes', 'hORFeome 9.1'), # <<< --- THIS HIDES THE 'A' AND 'B' LABELS
# set_colors=("#BABDFF", '#CEFFBA'),
ax=ax) # Ensure ax is passed if you have a custom figure/axes
# color_left_only = '#FFC0CB' # Pink
color_right_only = '#BAFFF8' # Light Blue
color_intersection = '#FFD0BA' # Medium Purple
# if venn.get_patch_by_id('10'):
# venn.get_patch_by_id('10').set_facecolor(color_left_only)
# venn.get_patch_by_id('10').set_alpha(1.0) # Ensure full opacity if desired
if venn.get_patch_by_id('01'):
venn.get_patch_by_id('01').set_facecolor(color_right_only)
venn.get_patch_by_id('01').set_alpha(1.0) # Ensure full opacity if desired
if venn.get_patch_by_id('11'):
venn.get_patch_by_id('11').set_facecolor(color_intersection)
venn.get_patch_by_id('11').set_alpha(1.0) # Ensure full opacity if desired
# subset_ids = ['10', '01', '11'] # '10': only A, '01': only B, '11': intersection
subset_ids = ['10'] # '10': only A, '01': only B, '11': intersection
for subset_id in subset_ids:
label = venn.get_label_by_id(subset_id)
if label: # Check if the label exists
label.set_visible(False) # Set its visibility to False
subsets = ['01', '11']
for subset in subsets:
number_label = venn.get_label_by_id(subset)
if number_label: # Check if the label exists
number_label.set_fontsize(30)
# # Add custom labels with boxes outside the Venn diagram
# plt.text(-1.5, 0.5, 'Verified Genes\nfrom analysis', fontsize=12,
# bbox=dict(facecolor='cyan', edgecolor='black', boxstyle='round,pad=0.5'))
# plt.text(1.0, 0.5, 'Verified Genes\nfrom hORFeome 9.1', fontsize=12,
# bbox=dict(facecolor='lightblue', edgecolor='black', boxstyle='round,pad=0.5'))
# # Calculate the percentage of interaction
intersection_percentage = (len(intersection) / len(Original_genes)) * 100
# Add the percentage text inside the plot with a box around it
plt.text(0.5, 0.65, f"Common Verified Genes: \n {intersection_percentage:.1f}%",
transform=plt.gca().transAxes,
horizontalalignment='center', verticalalignment='center', fontsize=30, fontweight='bold',
# bbox=dict(facecolor='white', edgecolor='black', boxstyle='round,pad=0.5')
)
label_A = venn.get_label_by_id('A')
label_B = venn.get_label_by_id('B')
if label_A:
# Example: Move 'Analyzed Genes' (label A) slightly left and up from its default
# You'll need to experiment with these (x,y) coordinates
# label_A.set_position((-0.6, 0.1)) # (x, y) relative to diagram's center (0,0)
label_A.set_fontsize(30) # You can also change font size here
label_A.set_fontweight('bold') # Make it bold for emphasis
if label_B:
# Example: Move 'hORFeome 9.1' (label B) slightly right and up from its default
# label_B.set_position((0.65, 0.1))
label_B.set_fontsize(30) # And here
label_B.set_fontweight('bold') # Make it bold for emphasis
# Add a title
# plt.title("Venn Diagram for Verified Genes", fontsize=14)
# Define the TPM Histogram graph file path for graph storing
Verified_Genes_path = os.path.join(graphs_files_stats, f"{directory}_Stats_Venn_Diagram(Verified_Genes).svg")
plt.savefig(Verified_Genes_path, format='svg', bbox_inches='tight', dpi=1000)
print(f"Merged read_summary.svg saved to {Verified_Genes_path}")
# Show the plot
plt.show()
# plt.close()
Merged read_summary.svg saved to /home/harryjo/rnaseq_analysis/RQ023682/RQ023682_graphs/Statistical_Graph/RQ023682_Stats_Venn_Diagram(Verified_Genes).svg
Automative statistical analysis pipeline¶
InĀ [88]:
# Assign handle - only when its necessary
statistical_analysis_handle = ""
# Allocate the below library (mix with other library)
from scipy.stats import chi2, t, norm
# Filter out numpy warnings
warnings.filterwarnings('ignore', category=RuntimeWarning)
# Getting right database (inclusion of all triplets) - Only getting column header not the data
triplet_database_path = os.path.join(database_files_original, f"{directory}_nor_clean_removed.xlsx")
triplet_database = pd.read_excel(triplet_database_path)
# Make the columns triplets
triplet_to_include = [column for column in triplet_database.columns if any(any(sample in column for sample in samples) for samples in sample_key.values())]
Triplet_maker = triplet_database[triplet_to_include]
names = Triplet_maker.columns.values
control = ["DMSO"]
Baseline = ["Baseline"]
drug_name_list = []
baseline_dmso_columns = [column for column in names if any(name in column for name in control)
or any(name in column for name in Baseline)]
selected_columns = [column for column in names if any(name in column for name in name_list)
or any(name in column for name in control)]
# triplets = [selected_columns[i:i + 3] for i in range(0, len(selected_columns), 3)]
triplets = list(OrderedDict((key, [item for item in selected_columns if item.split('-')[1] == key]) for key in [item.split('-')[1] for item in selected_columns]).values())
# To divide column names by DMSO controls
def DMSO_list(lst, word):
result = []
current_sublist = []
for item in lst:
if word in item[0]:
if current_sublist:
result.append(current_sublist)
current_sublist = []
current_sublist.append(item)
if current_sublist:
result.append(current_sublist)
return result
divided_triplets = DMSO_list(triplets, "DMSO")
def drug_to_remove(divided_triplets, drug_to_exclude):
updated_divided_triplets = []
for sublist in divided_triplets:
updated_sublist = []
for inner_list in sublist:
updated_inner_list = [drug for drug in inner_list if not any(exclude in drug for exclude in drug_to_exclude)]
if updated_inner_list:
updated_sublist.append(updated_inner_list)
if updated_sublist:
updated_divided_triplets.append(updated_sublist)
return updated_divided_triplets
drug_to_exclude = ['61-Vinblastine-B', '51-Irinotecan-B', '59-6mercaptopurine-C']
updated_divided_triplets = drug_to_remove(divided_triplets, drug_to_exclude)
def separate_batches(batch_list):
control_samples = []
experimental_samples = []
for i, batch in enumerate(batch_list, start=1):
# Initialize lists for control and experimental samples within the batch
batch_control = []
batch_experimental = []
for sub_batch in batch:
# Check if the sub-batch contains "DMSO"
if any("DMSO" in item for item in sub_batch):
# Append the triplet to control samples
batch_control.extend(sub_batch)
else:
# Append each triplet of drugs to experimental samples
triplet_list = [sub_batch[j:j + 3] for j in range(0, len(sub_batch), 3)]
batch_experimental.extend(triplet_list)
# Append control and experimental samples within the batch
control_samples.append((f"Batch {i} control_samples =", batch_control))
experimental_samples.append((f"Batch {i} experimental_samples =", batch_experimental))
return control_samples, experimental_samples
# Example usage
control_samples, experimental_samples = separate_batches(updated_divided_triplets)
def extract_name(input_list):
drug_names = set()
for item in input_list:
parts = item.split('-')
if len(parts) == 3 and parts[1].isalnum():
drug_names.add(parts[1])
return ' '.join(drug_names)
# Permutation testing initiation
Iter_num = 10000
min_n_sample_for_testing = 2
# Create a copy of batch_compile_stat_df
meanFC_allpval_df = batch_compile_stat_df.copy()
meanFC_allpval_df = meanFC_allpval_df.reset_index()
only_meanFC_allpval_df = batch_compile_stat_df.copy()
only_meanFC_allpval_df = only_meanFC_allpval_df.iloc[:, :4]
only_meanFC_allpval_df = only_meanFC_allpval_df.reset_index()
# Print the results
for i, (control_sample_name, control_sample_data), (experimental_sample_name, experimental_sample_data) in zip(
range(1, len(control_samples) + 1), control_samples, experimental_samples):
for exp_sample in experimental_sample_data:
drug_name = extract_name(exp_sample)
drug_name_list.append(drug_name)
if statistical_analysis_handle == "Yes":
print(drug_name)
nullFC_list = []
pvalue_list = []
FC_list = []
nan_no_list = []
# Access the DataFrame using the individual column names
tested_columns = np.array(batch_compile_stat_df[control_sample_data + exp_sample])
tested_columns = np.array([[float(val) if val != 'nan' else np.nan for val in row] for row in tested_columns],
dtype=float)
[row_num, col_num] = tested_columns.shape
treat_num = len(exp_sample)
control_num = len(control_sample_data)
trnd = np.zeros([row_num, Iter_num])
wrnd = np.zeros([row_num, Iter_num])
mrnd = np.zeros([row_num, Iter_num])
wzrnd = np.zeros([row_num, Iter_num])
[trnd, wrnd, mrnd, wzrnd] = permutation_test(tested_columns, treat_num, control_num, Iter_num)
nullFC_list.append(pd.DataFrame(mrnd).stack(level=-1, dropna=False).values)
x = tested_columns[:, control_num:] # treatment group
y = tested_columns[:, :control_num] # control group
nonnan_count_x = np.isfinite(x).sum(axis=1)
nonnan_count_y = np.isfinite(y).sum(axis=1)
nonnan_bool_x = np.array([np.nan if i < min_n_sample_for_testing else 1 for i in nonnan_count_x])
nonnan_bool_y = np.array([np.nan if i < min_n_sample_for_testing else 1 for i in nonnan_count_y])
nonnan_bool = nonnan_bool_x * nonnan_bool_y
nan_no_list.append([0 if pd.isnull(i) else 1 for i in nonnan_bool])
[pt, pw, pm, pwz, ovp1] = pval_calc(trnd, wrnd, mrnd, wzrnd, x, y, True, 1)
[pt, pw, pm, pwz, ovp1] = [pt, pw, pm, pwz, ovp1] * nonnan_bool
[pt, pw, pm, pwz, ovp3] = pval_calc(trnd, wrnd, mrnd, wzrnd, x, y, True, 3)
[pt, pw, pm, pwz, ovp3] = [pt, pw, pm, pwz, ovp3] * nonnan_bool
pvalue_list.append([pt, pw, pm, pwz, ovp1, ovp3])
FC = np.nanmean(x, axis=1) - np.nanmean(y, axis=1)
FC = FC * nonnan_bool
FC_list.append(FC)
# Extract the drug name
testing_var = ['nan_filter', 'FC', 'pt', 'pw', 'pm', 'pwz', 'ovp1', 'ovp3']
df_testing_result = pd.concat([pd.concat([pd.DataFrame(nan_no_list[idx]).T, pd.DataFrame(FC_list[idx]).T, pd.DataFrame(i)], axis=0) for idx,i in enumerate(pvalue_list)], axis = 1).T
df_testing_result.columns = [f"{var}_{drug_name}" for var in testing_var]
meanFC_allpval_df = pd.concat([meanFC_allpval_df, df_testing_result], axis=1)
only_meanFC_allpval_df = pd.concat([only_meanFC_allpval_df, df_testing_result], axis=1)
if statistical_analysis_handle == "Yes":
meanFC_allpval_df.set_index('index', inplace=True)
meanFC_allpval_df.index.name = None
only_meanFC_allpval_df.set_index('index', inplace=True)
only_meanFC_allpval_df.index.name = None
# Saving the DataFrame
meanFC_allpval_Path = os.path.join(database_files_stats, f"{directory}_all_meanFC_allpval.xlsx")
meanFC_allpval_df.to_excel(meanFC_allpval_Path, index=False)
only_meanFC_allpval_Path = os.path.join(database_files_stats, f"{directory}_only_meanFC_allpval.xlsx")
only_meanFC_allpval_df.to_excel(only_meanFC_allpval_Path, index=False)
else:
print("Statiscial analysis handle is not set")
only_meanFC_allpval_Path = os.path.join(database_files_stats, f"{directory}_only_meanFC_allpval.xlsx")
# After the code, you can reset the warnings to their default behavior
warnings.resetwarnings()
Statiscial analysis handle is not set
InĀ [89]:
only_meanFC_allpval_df = pd.read_excel(only_meanFC_allpval_Path)
# List of p_values to include
p_values = ['pt', 'pw', 'pm', 'pwz', 'ovp1', 'ovp3']
# Columns to select for each drug
columns_to_select = ['ORF_ID', 'NCBI', 'Group', 'Gene_Symbol']
# Iterate over each p-value
for p_value in p_values:
# Create an empty DataFrame to store the results for this p-value
result_df = only_meanFC_allpval_df[columns_to_select]
# Iterate over each drug name and create a dataframe for each drug with the selected p-value
for drug_name in drug_name_list:
# Select columns that contain drug_name and the specified p-value
relevant_cols = [f'nan_filter_{drug_name}', f'FC_{drug_name}', f'{p_value}_{drug_name}']
# Filter the DataFrame to include only the relevant columns
relevant_df = only_meanFC_allpval_df[relevant_cols].copy()
# Append the relevant data for this drug to the result DataFrame
result_df = pd.concat([result_df, relevant_df], axis=1)
file_name = f'{directory}_all_meanFC_{p_value}.xlsx'
p_value_path = os.path.join(database_files_stats, file_name)
result_df.to_excel(p_value_path, index=False)
Saving separate P value dataframes¶
Evaluation of mutiple P values¶
InĀ [90]:
# List of p_values to include
p_values = ['pt', 'pw', 'pm', 'pwz', 'ovp1', 'ovp3']
# Initialize a dictionary to store counts for each drug and p-value
counts_per_drug_per_pvalue = {drug_name: {p_value: 0 for p_value in p_values} for drug_name in drug_name_list}
# Iterate over each p-value
for p_value in p_values:
# Iterate over each drug name
for drug_name in drug_name_list:
# Select relevant columns for the specified drug and p-value
relevant_cols = [f'nan_filter_{drug_name}', f'FC_{drug_name}', f'{p_value}_{drug_name}']
# Filter the DataFrame based on specified conditions
filtered_df = only_meanFC_allpval_df[
(only_meanFC_allpval_df[relevant_cols[0]] == 1) &
((only_meanFC_allpval_df[relevant_cols[1]] > 0.5) | (only_meanFC_allpval_df[relevant_cols[1]] < -0.5)) &
(only_meanFC_allpval_df[f'{p_value}_{drug_name}'] < 0.05)
]
# Count the number of genes in the filtered DataFrame
gene_count = filtered_df.shape[0]
# Update the counts for this drug and p-value
counts_per_drug_per_pvalue[drug_name][p_value] = gene_count
# Create subplots for each p-value
fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(20, 16))
axes = axes.flatten()
for i, p_value in enumerate(p_values):
drug_names = list(counts_per_drug_per_pvalue.keys())
counts = [counts_per_drug_per_pvalue[drug_name][p_value] for drug_name in drug_names]
# Get the colors for the bars based on the drug_color_map
colors = [drug_color_map[drug_name] for drug_name in drug_names]
ax = axes[i]
bars = ax.bar(drug_names, counts, color=colors)
# Set the background to white and remove the edges
ax.set_facecolor('white')
for bar in bars:
bar.set_edgecolor('none')
ax.set_ylabel('Number of Genes', fontsize=12)
ax.set_title(f'Filtered genes for {p_value}', fontsize=20)
ax.tick_params(axis='x', rotation=90, labelsize=11)
ax.tick_params(axis='y', labelsize=12)
# Remove empty subplots if any
for i in range(len(p_values), len(axes)):
fig.delaxes(axes[i])
# Define the file path for storing graph
stat_test_path = os.path.join(graphs_files_stats, f"{directory}_pval_filter.svg")
plt.savefig(stat_test_path, format='svg', bbox_inches='tight', dpi=300)
print(f"{directory}_pval_filter.svg to {stat_test_path}")
plt.tight_layout()
plt.show()
RQ023682_pval_filter.svg to /home/harryjo/rnaseq_analysis/RQ023682/RQ023682_graphs/Statistical_Graph/RQ023682_pval_filter.svg
InĀ [91]:
# Make a copy of the original dictionary
drug_categories = drug_category.copy()
color_categories = category_colors.copy()
# Check if 'Control' key exists in the dictionary and remove it if present
if 'Control' in drug_categories:
del drug_categories['Control']
if 'Control' in color_categories:
del color_categories['Control']
Cumulative Number of Filtered Genes as Number of Drugs increases¶
InĀ [92]:
# Path to the DataFrame file
Final_pval_path = os.path.join(database_files_stats, f"{directory}_all_meanFC_ovp3.xlsx")
# Read the DataFrame
Final_pval = pd.read_excel(Final_pval_path)
# Dictionary to store filtered DataFrames for each drug
filter_dfs = {}
# Iterate over each drug and filter the DataFrame
for drug_name in drug_name_list:
relevant_cols = [f'nan_filter_{drug_name}', f'FC_{drug_name}', f'ovp3_{drug_name}']
# Filter the DataFrame based on specified conditions for the current drug
filtered_df = Final_pval[
(Final_pval[relevant_cols[0]] == 1) &
((Final_pval[relevant_cols[1]] > 0.5) | (Final_pval[relevant_cols[1]] < -0.5)) &
(Final_pval[f'ovp3_{drug_name}'] < 0.05)
]
# Store the unique genes (rows) for the current drug
filter_dfs[drug_name] = set(filtered_df['NCBI'])
# Sort the drugs by the size of their filtered sets
sorted_drugs = sorted(filter_dfs, key=lambda x: len(filter_dfs[x]),reverse=True)
# Calculate the cumulative count of genes for each cumulative drug count
cumulative_gene_counts = []
# Start with the smallest set of genes
smallest_genes = filter_dfs[sorted_drugs[0]]
cumulative_gene_counts.append(len(smallest_genes))
# Iterate over the drugs, starting from the second smallest
for drug_name in sorted_drugs[1:]:
# Add genes from the next drug while avoiding duplicates
smallest_genes.update(filter_dfs[drug_name])
cumulative_gene_counts.append(len(smallest_genes))
# Create the plot
plt.figure(figsize=(10, 6))
plt.plot(range(1, len(drug_name_list) + 1), cumulative_gene_counts, marker='o', color='black')
plt.xlabel('Cumulative Drug Count (Sorted by Gene Count)')
plt.ylabel('Cumulative Gene Count')
plt.title('Number of Cumulative Genes')
plt.xticks(range(1, len(drug_name_list) + 1))
plt.grid(True)
# Format y-axis labels with a comma for thousands separator
plt.gca().get_yaxis().set_major_formatter(mticker.FuncFormatter(lambda x, p: format(int(x), ',')))
plt.show()
InĀ [93]:
from scipy import stats
# Function to perform permutation testing by swapping genes within samples (treatments)
def permuted_cumulative_gene_count(sorted_drugs, filter_dfs):
genes = set()
cumulative_gene_counts = []
for drug_name in sorted_drugs:
# Get the set of genes for the current drug
drug_genes = list(filter_dfs[drug_name])
# Shuffle the order of genes within the drug (sample)
permuted_genes = random.sample(drug_genes, len(drug_genes))
genes.update(permuted_genes)
cumulative_gene_counts.append(len(genes))
return cumulative_gene_counts
num_permutations = 1000 # You can adjust the number of permutations as needed
permuted_counts = []
for _ in range(num_permutations):
random.shuffle(sorted_drugs) # Randomly shuffle the drug order
permuted_counts.append(permuted_cumulative_gene_count(sorted_drugs, filter_dfs))
# Calculate the degrees of freedom for the t-distribution
degrees_of_freedom = num_permutations - 1
confidence_level = 0.9999
t_critical = abs(stats.t.ppf((1 - confidence_level) / 2, degrees_of_freedom))
mean_counts = np.mean(permuted_counts, axis=0)
std_dev = np.std(permuted_counts, axis=0)
standard_error = std_dev / np.sqrt(num_permutations)
upper_bound = mean_counts + t_critical * standard_error # 95% confidence interval
lower_bound = mean_counts - t_critical * standard_error
plt.figure(figsize=(16,8))
ax = plt.gca()
plt.plot(range(1, len(drug_name_list) + 1), mean_counts, marker='o', color='black', label='Set of Genes', linewidth=3, markersize=10)
plt.fill_between(range(1, len(drug_name_list) + 1), lower_bound, upper_bound, color='#F6CFFC', alpha=1, label='95% CI')
plt.xlabel('Cumulative Number of chemotherapeutics')
plt.ylabel('Cumulative Number of Genes')
plt.title(' ')
plt.xticks(range(1, len(drug_name_list) + 1))
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
plt.grid(False)
plt.gca().get_yaxis().set_major_formatter(mticker.FuncFormatter(lambda x, p: format(int(x), ',')))
plt.legend(frameon=False)
# Define the TPM Histogram graph file path for graph storing
Permuted_Genes_path = os.path.join(graphs_files_stats, f"{directory}_Stats_Permuted_Genes.svg")
plt.savefig(Permuted_Genes_path , format='svg', bbox_inches='tight', dpi=1000)
print(f"Merged read_summary.svg saved to {Permuted_Genes_path}")
plt.show()
Merged read_summary.svg saved to /home/harryjo/rnaseq_analysis/RQ023682/RQ023682_graphs/Statistical_Graph/RQ023682_Stats_Permuted_Genes.svg
Numbers of Drugs pass the filters in Genes¶
InĀ [94]:
# Path to the DataFrame file
Final_pval_path = os.path.join(database_files_stats, f"{directory}_all_meanFC_ovp3.xlsx")
# Read the DataFrame
Final_pval = pd.read_excel(Final_pval_path)
# Create a DataFrame to store the filter results (True/False for each data point)
filter_results = pd.DataFrame(index=Final_pval.index)
# Iterate over each drug and check if each data point passes the filter
for drug_name in drug_name_list:
relevant_cols = [f'nan_filter_{drug_name}', f'FC_{drug_name}', f'ovp3_{drug_name}']
# Apply the filter conditions and store the results in the filter_results DataFrame
filter_results[drug_name] = (
(Final_pval[relevant_cols[0]] == 1) &
(
(Final_pval[relevant_cols[1]] >= 0.5)
|
(Final_pval[relevant_cols[1]] <= -0.5)
)
&
(Final_pval[f'ovp3_{drug_name}'] <= 0.05)
)
# Count how many 'True' values there are per row in the filter_results DataFrame
num_drugs_passed_filter = filter_results.sum(axis=1)
# Count how many genes have a specific number of drugs passing the filter (0 to 16)
count_per_num_drugs = num_drugs_passed_filter.value_counts().sort_index()
# Create a list to store the counts of "True" values in each row
true_counts = filter_results.apply(lambda row: row.sum(), axis=1)
# Find the rows where the count of "True" values is greater than or equal to 7
selected_rows = filter_results[true_counts >= 7]
# Create an empty DataFrame to store the count of "True" values for each category
category_counts = pd.DataFrame(index=selected_rows.index)
for category, drugs in drug_categories.items():
# Count the number of "True" values for each drug category and assign the category name as the column header
category_counts[category] = selected_rows[drugs].sum(axis=1)
# Create a condition for at least three categories with at least two "True" values
final_condition = (category_counts >= 1).sum(axis=1) >= 3
# Apply the final condition to select rows
reselected_rows = selected_rows[final_condition]
core_indices = reselected_rows.index
# Finding Core Gene NCBI
Core_Genes = Final_pval.loc[core_indices, 'NCBI']
Core_Genes = sorted(list(Core_Genes))
# Get unique sorted values from num_drugs_passed_filter
sorted_keys = sorted(num_drugs_passed_filter.unique())
# Initialize an empty dictionary to store the results
Number_drugs_dict = {}
# Iterate through the sorted keys
for key in sorted_keys:
# Get the corresponding 'NCBI' values for this key
ncbi_values = Final_pval.loc[num_drugs_passed_filter == key, 'NCBI'].tolist()
# Sort the 'NCBI' values within the key
ncbi_values.sort()
# Add the key-value pair to the results dictionary
Number_drugs_dict[key] = ncbi_values
# Additional column indicating what drugs that has passed filter in particular genes
num_drugs_passed_filter_with_drugs = num_drugs_passed_filter.index.to_series().apply(
lambda index: '|'.join(filter_results.columns[filter_results.loc[index]]))
num_drugs_passed_filter_with_drugs.index = num_drugs_passed_filter.index
# cmap = plt.cm.cool
cmap = plt.cm.copper
num_bars = len(count_per_num_drugs)
colors = [cmap(i) for i in np.linspace(0, 1, num_bars)]
# Create a histogram plot
plt.figure(figsize=(16,8))
ax = plt.gca()
plt.bar(count_per_num_drugs.index, count_per_num_drugs.values, align='center', alpha=0.7, color=colors)
plt.yscale('log') # Set y-axis to log scale
plt.xlabel('Number of Chemothrapeuics with DEGs')
plt.ylabel('Number of Genes')
plt.gca().get_yaxis().set_major_formatter(mticker.FuncFormatter(lambda x, p: format(int(x), ',')))
# plt.title('Histogram of Genes by Number of Drugs Passing Filter')
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
plt.xticks(range(len(drug_name_list)+1))
# Define the TPM Histogram graph file path for graph storing
Filtered_Genes_path = os.path.join(graphs_files_stats, f"{directory}_Stats_Filtered_Genes.svg")
plt.savefig(Filtered_Genes_path , format='svg', bbox_inches='tight', dpi=1000)
print(f"Merged read_summary.svg saved to {Filtered_Genes_path }")
plt.show()
Merged read_summary.svg saved to /home/harryjo/rnaseq_analysis/RQ023682/RQ023682_graphs/Statistical_Graph/RQ023682_Stats_Filtered_Genes.svg
InĀ [95]:
# Additional column indicating what drugs passed filter with direction (ā/ā)
def get_passed_drugs_with_arrows(index):
passed_drugs = []
for drug in filter_results.columns:
if filter_results.loc[index, drug]:
fc = Final_pval.loc[index, f'FC_{drug}']
arrow = 'ā' if fc >= 0.5 else 'ā'
passed_drugs.append(f"{drug}{arrow}")
return '|'.join(passed_drugs)
num_drugs_passed_filter_with_drugs = num_drugs_passed_filter.index.to_series().apply(get_passed_drugs_with_arrows)
num_drugs_passed_filter_with_drugs.index = num_drugs_passed_filter.index
InĀ [96]:
# Define the observed data
observed_data = np.arange(len(count_per_num_drugs)) # The x-values (0 to 13)
counts = np.array(count_per_num_drugs) # The corresponding counts
# Define the probability density function for the mixture distribution
def mixture_pdf(x, params):
weight_gamma, alpha, beta, mu, sigma = params
gamma_pdf = weight_gamma * gamma.pdf(x, alpha, scale=1/beta)
normal_pdf = (1 - weight_gamma) * norm.pdf(x, loc=mu, scale=sigma)
return gamma_pdf + normal_pdf
# Define the negative log-likelihood function to be minimized
def negative_log_likelihood(params):
return -np.sum(counts * np.log(mixture_pdf(observed_data, params)))
# Initial guess for parameters
initial_params = [0.5, 1, 1, 6, 3] # Adjust initial parameters as needed
# Define bounds for the parameters
bounds = [(0, 1), (0, None), (0, None), (0, None), (0, None)]
# Minimize the negative log-likelihood to estimate parameters
result = minimize(negative_log_likelihood, initial_params, bounds=bounds)
# Get the estimated parameters
estimated_params = result.x
# Generate x values for the plot
x = np.arange(0, 14, 0.1)
# Plot the observed data and the fitted mixture distribution
plt.figure(figsize=(8, 6))
plt.bar(observed_data, counts, color='black', alpha=0.7, label='Observed Data')
plt.plot(x, counts.sum() * mixture_pdf(x, estimated_params), 'r-', label='Mixture Distribution', linewidth=2)
# plt.yscale('log')
plt.xlabel('Number of Drugs Passing Filter')
plt.ylabel('Gene Count')
plt.title('Mixture Distribution Fit to Observed Data')
plt.legend()
plt.show()
/home/harryjo/anaconda3/envs/pipeline/lib/python3.11/site-packages/scipy/optimize/_numdiff.py:596: RuntimeWarning: invalid value encountered in subtract
InĀ [97]:
# Define the probability density function for the mixture distribution
def mixture_pdf(x, params):
weight_gamma, alpha, beta, mu, sigma = params
gamma_pdf = weight_gamma * gamma.pdf(x, alpha, scale=1/beta)
normal_pdf = (1 - weight_gamma) * norm.pdf(x, loc=mu, scale=sigma)
return gamma_pdf, normal_pdf
# Define the negative log-likelihood function to be minimized
def negative_log_likelihood(params):
gamma_pdf, normal_pdf = mixture_pdf(observed_data, params)
return -np.sum(counts * np.log(gamma_pdf + normal_pdf))
# Initial guess for parameters
initial_params = [0.5, 1, 1, 6, 3] # Adjust initial parameters as needed
# Define bounds for the parameters
bounds = [(0, 1), (0, None), (0, None), (0, None), (0, None)]
# Minimize the negative log-likelihood to estimate parameters
result = minimize(negative_log_likelihood, initial_params, bounds=bounds)
# Get the estimated parameters
estimated_params = result.x
# Calculate the cutoff point by finding the intersection
cutoff = brentq(lambda x: mixture_pdf(x, estimated_params)[0] - mixture_pdf(x, estimated_params)[1], 0, 14)
# Generate x values for the plot
x = np.arange(0, 14, 0.1)
# Create a subplot with 2 rows and 1 column
plt.figure(figsize=(30, 12))
# Plot the Gamma component in the first subplot
plt.subplot(1, 2, 1)
gamma_pdf, normal_pdf = mixture_pdf(x, estimated_params)
plt.plot(x, counts.sum() * gamma_pdf, 'r-', label='Gamma Distribution', linewidth=2)
plt.xlabel('Number of Drugs Passing Filter')
plt.ylabel('Probability Density')
plt.title('Gamma Distribution')
plt.legend()
# Plot the Normal component in the second subplot
plt.subplot(1, 2, 2)
plt.plot(x, counts.sum() * normal_pdf, 'b-', label='Normal Distribution', linewidth=2)
plt.xlabel('Number of Drugs Passing Filter')
plt.ylabel('Probability Density')
plt.title('Normal Distribution')
plt.legend()
# Create a subplot with 2 rows and 1 column
plt.figure(figsize=(16, 16))
ax = plt.gca()
# Plot the Gamma component
plt.plot(x, counts.sum() * gamma_pdf, 'r-', label='Gamma Distribution', linewidth=10)
# Plot the Normal component
plt.plot(x, counts.sum() * normal_pdf, 'b-', label='Normal Distribution', linewidth=10)
# Plot the cutoff point
plt.axvline(cutoff, color='black', linestyle='--', label='Cutoff Point', linewidth=5)
plt.xlabel('Number of Drugs Passing Filter')
plt.ylabel('Probability Density')
plt.title('')
plt.legend( loc='upper right')
plt.yticks(np.arange(0, 8001, 2000))
# # Format the cutoff value to 3 significant figures
# cutoff_text = f"Cutoff: {cutoff:.3g}"
# # Get the current y-axis limits to place the text appropriately
# ymin, ymax = plt.ylim()
# # Choose a y-position for the text (e.g., 90% of the way up the y-axis)
# text_y_position = ymin + (ymax - ymin) * 0.2
# Add the text slightly to the right of the line
# plt.text(cutoff + 0.2, text_y_position, cutoff_text,
# color='black',
# fontsize=plt.rcParams['font.size'] * 1, # Slightly smaller than global font size
# horizontalalignment='left', # Align text to its left
# verticalalignment='center',
# fontweight='bold') # Vertically center the text
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
# Adjust the spacing between subplots
plt.tight_layout()
# Define the TPM Histogram graph file path for graph storing
Multi_Norm_path = os.path.join(graphs_files_stats, f"{directory}_Stats_Multi_norms.svg")
plt.savefig(Multi_Norm_path, format='svg', bbox_inches='tight', dpi=1000)
print(f"Merged read_summary.svg saved to {Multi_Norm_path}")
# Show the plot
plt.show()
# plt.close()
print("Cutoff Point:", cutoff)
Merged read_summary.svg saved to /home/harryjo/rnaseq_analysis/RQ023682/RQ023682_graphs/Statistical_Graph/RQ023682_Stats_Multi_norms.svg
Cutoff Point: 2.6434598205529287
Categorical Genes¶
InĀ [98]:
# Assign handle - only when its necessary
categorical_analysis_handle = ""
# Allocate the below library (mix with other library)
from scipy.stats import chi2, t, norm
# Filter out numpy warnings
warnings.filterwarnings('ignore', category=RuntimeWarning)
# Permutation testing initiation
Iter_num = 10000
min_n_sample_for_testing = 2
# Create a dataframe for categorical analysis
category_df = Final_pval.copy()
# List to store the ordered column names
ordered_columns = []
desired_order = ["Antimetabolite", "DNA cross linking agent", "DNA strand break agent", "Microtubule inhibitor"]
# Iterate through the desired order of drug categories
for category in desired_order:
drugs_in_category = drug_category.get(category, [])
for drug in drugs_in_category:
# Add the FC and pval columns for each drug
ordered_columns.extend([f"FC_{drug}", f"ovp3_{drug}"])
# Reorder the columns in the DataFrame and reset the index
category_df = category_df[ordered_columns]
fc_columns = [col for col in category_df.columns if "FC" in col]
category_df = category_df[fc_columns]
# Create a dataframe for categorical analysis
categorical_df = batch_compile_stat_df.iloc[:, :4]
categorical_df = categorical_df.reset_index()
# Create a dictionary to hold control samples for each category
category_control_samples = {category: [] for category in drug_categories}
for category in drug_categories.keys():
experimental_samples = []
control_samples = []
for name in ordered_columns:
drug_name = name.split('_')[1]
if drug_name in drug_categories[category]:
experimental_samples.append(name)
elif category not in [cat for cat, drugs in drug_categories.items() if drug_name in drugs]:
control_samples.append(name)
if categorical_analysis_handle == "Yes":
print(category)
nullFC_list = []
pvalue_list = []
FC_list = []
nan_no_list = []
# Access the DataFrame using the individual column names
tested_columns = np.array(category_df[control_samples + experimental_samples])
tested_columns = np.array([[float(val) if val != 'nan' else np.nan for val in row] for row in tested_columns],
dtype=float)
[row_num, col_num] = tested_columns.shape
treat_num = len(experimental_samples)
control_num = len(control_samples)
trnd = np.zeros([row_num, Iter_num])
wrnd = np.zeros([row_num, Iter_num])
mrnd = np.zeros([row_num, Iter_num])
wzrnd = np.zeros([row_num, Iter_num])
[trnd, wrnd, mrnd, wzrnd] = permutation_test(tested_columns, treat_num, control_num, Iter_num)
nullFC_list.append(pd.DataFrame(mrnd).stack(level=-1, dropna=False).values)
x = tested_columns[:, control_num:] # treatment group
y = tested_columns[:, :control_num] # control group
nonnan_count_x = np.isfinite(x).sum(axis=1)
nonnan_count_y = np.isfinite(y).sum(axis=1)
nonnan_bool_x = np.array([np.nan if i < min_n_sample_for_testing else 1 for i in nonnan_count_x])
nonnan_bool_y = np.array([np.nan if i < min_n_sample_for_testing else 1 for i in nonnan_count_y])
nonnan_bool = nonnan_bool_x * nonnan_bool_y
nan_no_list.append([0 if pd.isnull(i) else 1 for i in nonnan_bool])
[pt, pw, pm, pwz, ovp3] = pval_calc(trnd, wrnd, mrnd, wzrnd, x, y, True, 3)
[pt, pw, pm, pwz, ovp3] = [pt, pw, pm, pwz, ovp3] * nonnan_bool
pvalue_list.append([ovp3])
FC = np.nanmedian(x, axis=1) - np.nanmedian(y, axis=1)
FC = FC * nonnan_bool
FC_list.append(FC)
# Extract the drug name
testing_var = ['nan_filter', 'FC', 'ovp3']
df_testing_result = pd.concat([pd.concat([pd.DataFrame(nan_no_list[idx]).T, pd.DataFrame(FC_list[idx]).T, pd.DataFrame(i)], axis=0) for idx,i in enumerate(pvalue_list)], axis = 1).T
# df_testing_result.columns = [f"{var}_{drug}" for var in testing_var]
df_testing_result.columns = [f"{var}_{category}" for var in testing_var]
categorical_df = pd.concat([categorical_df, df_testing_result], axis=1)
if categorical_analysis_handle == "Yes":
categorical_df.set_index('index', inplace=True)
categorical_df.index.name = None
# Saving the DataFrame
categorical_Path = os.path.join(database_files_stats, f"{directory}_categorical_analysis.xlsx")
categorical_df.to_excel(categorical_Path , index=False)
else:
print("Categorical analysis handle is not set")
# After the code, you can reset the warnings to their default behavior
warnings.resetwarnings()
Categorical analysis handle is not set
InĀ [99]:
# Recalling Categorical & Peripheral dataframes
categorical_Path = os.path.join(database_files_stats, f"{directory}_categorical_analysis.xlsx")
categorical_df = pd.read_excel(categorical_Path)
## Modifying the dataframes
# Remove columns containing 'nan_filter'
categorical_mod_df = categorical_df.drop(columns=[col for col in categorical_df.columns if 'nan_filter' in col])
# Fill NaN values in 'FC' columns with 0
categorical_fc_columns = [col for col in categorical_mod_df.columns if 'FC' in col]
categorical_mod_df[categorical_fc_columns] = categorical_mod_df[categorical_fc_columns].fillna(0)
# Replace NaN values in 'ovp3' columns with 1
categorical_ovp3_columns = [col for col in categorical_mod_df.columns if 'ovp3' in col]
categorical_mod_df[categorical_ovp3_columns] = categorical_mod_df[categorical_ovp3_columns].fillna(1)
# Replace 'ovp3' with 'pval' in column names
categorical_mod_df.columns = [col.replace('ovp3', 'pval') for col in categorical_mod_df.columns]
InĀ [100]:
# Dictionary to store filtered DataFrames for each drug
category_keys = list(drug_categories.keys())
# Create a figure and axes
fig, ax = plt.subplots(figsize=(15, 15))
# Iterate over each drug and filter the DataFrame
for category_name in category_keys:
fc_column = f'FC_{category_name}'
data = categorical_mod_df[fc_column]
ax.hist(data, bins=20, alpha=0.5, label=f'FC_{category_name}')
# Set plot titles and labels
ax.set_title('Histogram of FC Values for Categories')
ax.set_xlabel('FC Values')
ax.set_ylabel('Frequency')
# Add a legend
ax.legend()
# Display the plot
plt.show()
Number of Unique Genes per Categories¶
InĀ [101]:
# Dictionary to store filtered DataFrames for each drug
category_keys = list(drug_categories.keys())
category_dfs = {}
category_dfs_no = {}
category_dfs_select = {}
category_dfs_select_no = {}
# Iterate over each drug and filter the DataFrame
for category_name in category_keys:
relevant_cols = [f'FC_{category_name}', f'pval_{category_name}']
# Filter the DataFrame based on specified conditions for the current drug
category_df = categorical_mod_df[
(
(categorical_mod_df[relevant_cols[0]] >= 1.0)
|
(categorical_mod_df[relevant_cols[0]] <= -1.0)
)
&
(categorical_mod_df[relevant_cols[1]] <= 0.01)
]
# Store the unique genes (rows) for the current drug
category_dfs[category_name] = set(category_df['NCBI'])
category_dfs_no[category_name] = len(category_df['NCBI'])
# Store the unique genes (rows) for the current drug in a set
selected_genes = set(category_df['NCBI'])
# Check for other drugs and remove genes with p-value < 0.05 for other drugs
for other_category in category_keys:
if other_category != category_name:
other_cols = [f'FC_{other_category}', f'pval_{other_category}']
genes_to_remove = categorical_mod_df[
(
(categorical_mod_df[other_cols[0]] >= 1.0)
|
(categorical_mod_df[other_cols[0]] <= -1.0)
)
&
(
(categorical_mod_df[other_cols[1]] <= 0.01)
)
]['NCBI']
selected_genes.difference_update(set(genes_to_remove))
# Store the selected genes in the dictionary
category_dfs_select[category_name] = set(selected_genes) - set(Core_Genes)
category_dfs_select_no[category_name] = len(category_dfs_select[category_name])
# Create a bar plot for the number of unique genes in each category with specified colors
plt.figure(figsize=(16, 8))
ax = plt.gca()
for i, (category, color) in enumerate(color_categories.items()):
plt.bar(i, category_dfs_select_no.get(category, 0), color=color, label=category)
# plt.xlabel('Chemotherapeutics Categories')
plt.xlabel('')
plt.ylabel('Number of Unique Genes')
plt.title('')
# plt.xticks(range(len(color_categories)), list(drug_categories.keys()), rotation=45)
plt.xticks([])
# legend = plt.legend(loc='upper left', bbox_to_anchor=(1, 5), title='Categories')
# plt.legend()
plt.legend(loc='upper left', bbox_to_anchor=(1, 1.027), fontsize=25, title= 'Chemotherapeutics Categories', title_fontsize=30, frameon=False)
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
# Define the TPM Histogram graph file path for graph storing
Categorical_graph_path = os.path.join(graphs_files_stats, f"{directory}_Stats_Categorical_Graph.svg")
plt.savefig(Categorical_graph_path, format='svg', bbox_inches='tight', dpi=1000)
print(f"Merged read_summary.svg saved to {Categorical_graph_path}")
plt.tight_layout()
plt.show()
Merged read_summary.svg saved to /home/harryjo/rnaseq_analysis/RQ023682/RQ023682_graphs/Statistical_Graph/RQ023682_Stats_Categorical_Graph.svg
InĀ [102]:
# Dictionary to store filtered DataFrames for each drug
peripheral_dfs = {}
peripheral_dfs_no = {}
peripheral_dfs_select = {}
peripheral_dfs_select_no = {}
# Iterate over each drug and check if each data point passes the filter
for drug_name in drug_name_list:
relevant_cols = [f'nan_filter_{drug_name}', f'FC_{drug_name}', f'ovp3_{drug_name}']
# Apply the filter conditions and store the results in the filter_results DataFrame
peripheral_df = Final_pval[
(Final_pval[relevant_cols[0]] == 1) &
(
(Final_pval[relevant_cols[1]] >= 0.5)
|
(Final_pval[relevant_cols[1]] <= -0.5)
)
&
(Final_pval[f'ovp3_{drug_name}'] <= 0.05)
]
peripheral_dfs[drug_name] = set(peripheral_df['NCBI'])
peripheral_dfs_no[drug_name] = len(peripheral_df['NCBI'])
# Store the unique genes (rows) for the current drug in a set
selected_genes = set(peripheral_df['NCBI'])
# Check for other drugs and remove genes with p-value < 0.05 for other drugs
for other_category in drug_name_list:
if other_category != drug_name:
other_cols = [f'nan_filter_{other_category}', f'FC_{other_category}', f'ovp3_{other_category}']
genes_to_remove = Final_pval[
(Final_pval[other_cols[0]] == 1) &
(
(Final_pval[other_cols[1]] >= 0.5)
|
(Final_pval[other_cols[1]] <= -0.5)
)
&
(
(Final_pval[other_cols[2]] <= 0.05)
)
]['NCBI']
selected_genes.difference_update(set(genes_to_remove))
# Store the selected genes in the dictionary
peripheral_dfs_select[drug_name] = set(selected_genes - set(Core_Genes))
peripheral_dfs_select_no[drug_name] = len(peripheral_dfs_select[drug_name])
# Create an empty set to store all genes from category_dfs_select
all_category_genes = set()
# Iterate over each category and add its genes to the set
for genes in category_dfs_select.values():
all_category_genes.update(genes)
for drug_name in drug_name_list:
# Remove overlapping genes between peripheral_dfs_select and the current category_genes
peripheral_dfs_select[drug_name] = list(set(peripheral_dfs_select[drug_name]) - all_category_genes)
# Update the length after removing overlapping genes
peripheral_dfs_select_no[drug_name] = len(peripheral_dfs_select[drug_name])
desired_order = ["Antimetabolite", "DNA cross linking agent", "DNA strand break agent", "Microtubule inhibitor"]
# Create a list of tuples with drug names and their corresponding values
drugs_with_values = [(drug, peripheral_dfs_select_no[drug]) for category in desired_order for drug in drug_categories.get(category, [])]
# Sort the list of tuples based on the order in 'drugs_with_values'
sorted_drugs_with_values = sorted(drugs_with_values, key=lambda item: drugs_with_values.index(item))
# Create a new dictionary with the sorted drug-value pairs
sorted_peripheral_dfs_select_no = dict(sorted_drugs_with_values)
# Flatten the list of drugs from the categories
all_drugs = [drug for category in desired_order for drug in drug_categories.get(category, [])]
# Now, you can sort the drugs based on the order in the 'all_drugs' list
sorted_drugs = sorted(all_drugs, key=lambda drug: all_drugs.index(drug))
# Create a bar plot for the number of unique genes in each category with specified colors
plt.figure(figsize=(16, 11.5))
ax = plt.gca()
# Iterate over the drugs in drug_name_list
for i, drug in enumerate(sorted_drugs):
if drug in drug_color_map:
category = drug
color = drug_color_map[drug]
plt.bar(i, sorted_peripheral_dfs_select_no.get(category, 0), color=color, label=category)
plt.xlabel('')
plt.ylabel('Number of Unique Genes', fontsize=40)
# plt.title('Unique Genes per chemotherapies')
plt.title('')
# plt.xticks(range(len(sorted_drugs)), sorted_drugs, ha='right', rotation=45, fontsize=25)
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
plt.xticks([])
plt.tight_layout()
plt.legend(loc='upper left', bbox_to_anchor=(1, 1.05), title = 'Chemotherapeutics', fontsize=30, title_fontsize=30, frameon=False)
# Define the TPM Histogram graph file path for graph storing
Peripheral_graph_path = os.path.join(graphs_files_stats, f"{directory}_Stats_Peripheral_Graph.svg")
plt.savefig(Peripheral_graph_path, format='svg', bbox_inches='tight', dpi=1000)
print(f"Merged read_summary.svg saved to {Peripheral_graph_path}")
plt.tight_layout()
plt.show()
Merged read_summary.svg saved to /home/harryjo/rnaseq_analysis/RQ023682/RQ023682_graphs/Statistical_Graph/RQ023682_Stats_Peripheral_Graph.svg
InĀ [103]:
# Core Genes to set
Core_Genes = set(Core_Genes)
# Create a set to store the values
category_all_values = set()
# Create a set to store the overlapping values
overlapping_values = set()
for values in category_dfs_select.values():
for value in values:
if value in category_all_values:
overlapping_values.add(value)
else:
category_all_values.add(value)
category_all_values = set(sorted(category_all_values))
# Check if there are overlapping values
if overlapping_values:
print("There are overlapping values between Category.")
print("Overlapping values:", overlapping_values)
else:
print("There are no overlapping values between keys.")
# Create a set to store the values
peripheral_all_values = set()
# Create a set to store the overlapping values
overlapping_values = set()
for values in peripheral_dfs_select.values():
for value in values:
if value in peripheral_all_values:
overlapping_values.add(value)
else:
peripheral_all_values.add(value)
peripheral_all_values = set(sorted(peripheral_all_values))
# Check if there are overlapping values
if overlapping_values:
print("There are overlapping values between Peripheral.")
print("Overlapping values:", overlapping_values)
else:
print("There are no overlapping values between keys.")
There are no overlapping values between keys. There are no overlapping values between keys.
InĀ [104]:
# Initialize dictionaries to store overlapping and non-overlapping genes
filtered_rest_genes_dict = {}
overlapping_genes_dict = {}
# Iterate through the keys and values in Number_drugs_dict
for key, values in Number_drugs_dict.items():
# Find the intersection of the values with all three sets
overlapping_values = set(values) & (set(Core_Genes) | set(category_all_values) | set(peripheral_all_values))
# Find non-overlapping values
non_overlapping_values = set(values) - overlapping_values
# Check if there are overlapping values and non-overlapping values
if overlapping_values:
overlapping_genes_dict[key] = list(overlapping_values)
if non_overlapping_values:
filtered_rest_genes_dict[key] = list(non_overlapping_values)
InĀ [105]:
items_per_key = {key: len(value) for key, value in filtered_rest_genes_dict.items()}
# Print the items per key
print(items_per_key)
multidrug_genes = set()
non_respondent_genes = set()
for key, values in filtered_rest_genes_dict.items():
if key >= 2:
multidrug_genes.update(values)
if key <= 1:
non_respondent_genes.update(values)
{np.int64(0): 2849, np.int64(2): 3117, np.int64(3): 1877, np.int64(4): 953, np.int64(5): 414, np.int64(6): 194, np.int64(7): 1}
InĀ [106]:
from venn import venn
genes = {
"Core" : Core_Genes,
"Category": category_all_values,
"Peripheral": peripheral_all_values,
"Multidrug": multidrug_genes,
"Non-Respondent": non_respondent_genes
}
venn(genes, fontsize=50, legend_loc="upper right", cmap="viridis", figsize=(25,25))
fig = plt.gcf()
# --- REMOVE BOX FROM LEGEND & SET LEGEND TITLE ---
# Get the current axes object from the figure
ax = fig.gca()
# Get the legend object attached to these axes
legend = ax.get_legend()
if legend: # Check if a legend was actually created by the 'venn' function
legend.set_frame_on(False) # Remove the box (frame) around the legend
legend.set_title("Categorization") # Set the desired legend title
legend.get_title().set_fontsize(50) # Example font size for title
legend.get_title().set_fontweight('bold') # Make the title bold
for text_obj in legend.get_texts()[0:6]:
text_obj.set_fontsize(40)
# for patch in ax.patches:
# patch.set_alpha(0.7)
# set_names = list(genes.keys())
# single_set_keys = []
# for i in range(len(set_names)):
# # Create a bitmask for only the i-th set
# bitmask = '0' * i + '1' + '0' * (len(set_names) - 1 - i)
# single_set_keys.append(bitmask)
# print(f"Single set keys to look for: {single_set_keys}")
# Display the diagram
# Define the TPM Histogram graph file path for graph storing
Venn5_graph_path = os.path.join(graphs_files_stats, f"{directory}_Stats_Venn5_Graph.svg")
plt.savefig(Venn5_graph_path, format='svg', bbox_inches='tight', dpi=1000)
print(f"Merged read_summary.svg saved to {Venn5_graph_path}")
plt.tight_layout()
plt.show()
Merged read_summary.svg saved to /home/harryjo/rnaseq_analysis/RQ023682/RQ023682_graphs/Statistical_Graph/RQ023682_Stats_Venn5_Graph.svg
Resetting Log2 Fold Change & P Value dataframe¶
InĀ [107]:
# Calling right dataframe for FC and pval
Final_pval_path = os.path.join(database_files_stats, f"{directory}_all_meanFC_ovp3.xlsx")
Final_pval = pd.read_excel(Final_pval_path)
# Preparation for other indexing than FC or pval data
Final_pval_index = Final_pval.iloc[:, :4]
verified_columns = hORFeome9_1[['entrez_gene_id', 'Verified']]
# Merge the DataFrames on 'NCBI'
verified_index = pd.merge(Final_pval_index, verified_columns, left_on='NCBI', right_on='entrez_gene_id', how='left')
verified_index = verified_index.drop("entrez_gene_id", axis=1)
verified_index['Verified'] = verified_index['Verified'].fillna(0).astype(int)
verified_index = verified_index.set_index(batch_compile_stat_df.index)
verified_index = verified_index.reset_index()
# Get data for calculating silencing effect
silencing = ["mCherryPositive&BFPNegative", "mCherryNegative&BFPNegative"]
# Filter the DataFrame based on silencing terms in column names
silencing_df = batch_compile_stat_df.loc[:, batch_compile_stat_df.columns.str.contains('|'.join(silencing))]
silencing_compile_df = silencing_df.copy()
# Subtract the values and store the result in a new column using .loc
silencing_compile_df.loc[:, 'Silencing Ratio(log)'] = silencing_df['63-mCherryPositive&BFPNegative'] - silencing_df['64-mCherryNegative&BFPNegative']
# Define a function to determine silencing status based on the ratio
def get_silencing_status(ratio):
if np.isnan(ratio):
return 'Not Found'
elif ratio >= -3:
return 'No Silence'
else:
return 'Silenced'
# Apply the function to create the "Silencing" column
silencing_compile_df['Silencing'] = silencing_compile_df['Silencing Ratio(log)'].apply(get_silencing_status)
silencing_compile_df = silencing_compile_df.reset_index()
# Merge the "Silencing Ratio(log)" column from silencing_compile_df to verified_index
indexing_df = pd.merge(verified_index, silencing_compile_df[['index', 'Silencing Ratio(log)', 'Silencing']], on='index')
indexing_df['Drug Number'] = num_drugs_passed_filter.tolist()
indexing_df['Drug Annotation'] = num_drugs_passed_filter_with_drugs.tolist()
# Function to determine the 'Type' based on the conditions
def determine_type(row):
ncbi_number = row['NCBI']
if ncbi_number in Core_Genes:
return 'Core'
for category, values in category_dfs_select.items():
if ncbi_number in values:
return category
for drug, values in peripheral_dfs_select.items():
if ncbi_number in values:
return drug
if ncbi_number in multidrug_genes:
return 'Multidrug'
if ncbi_number in non_respondent_genes:
return 'Non-respondent'
return None # If not found in any of the sets
# Apply the function to create the 'Type' column
indexing_df['Type'] = indexing_df.apply(determine_type, axis=1)
# Add GO Count and Pubmed Count
GO_Count = pd.read_excel(f"/home/{user_id}/rnaseq_analysis/Reference/Addition/01_gene_numgo_exp.xlsx")
Pudmed_Count = pd.read_excel(f"/home/{user_id}/rnaseq_analysis/Reference/Addition//01_gene_numpub.xlsx")
# GO_Count.rename(columns={'Entrez ID': 'NCBI', 'Symbol': 'Gene_Symbol', '# exp GO': 'GO Count'}, inplace=True)
# Pudmed_Count.rename(columns={'Entrez ID': 'NCBI', 'Symbol': 'Gene_Symbol', '# Pubmed': 'Pubmed Count'}, inplace=True)
indexing_df = indexing_df.merge(GO_Count[['Entrez ID', '# exp GO']], left_on='NCBI', right_on='Entrez ID', how='left')
indexing_df.rename(columns={'# exp GO': 'GO Count'}, inplace=True)
indexing_df.drop('Entrez ID', axis=1, inplace=True)
indexing_df = indexing_df.merge(Pudmed_Count[['Entrez ID', '# Pubmed']], left_on='NCBI', right_on='Entrez ID', how='left')
indexing_df.rename(columns={'# Pubmed': 'Pubmed Count'}, inplace=True)
indexing_df.drop('Entrez ID', axis=1, inplace=True)
# Prepare FC and pval data
Final_pval_data = Final_pval.iloc[:, 4:]
Final_pval_data = Final_pval_data.set_index(batch_compile_stat_df.index)
# Remove columns containing 'nan_filter'
Final_pval_data = Final_pval_data.drop(columns=[col for col in Final_pval_data.columns if 'nan_filter' in col])
# Fill NaN values in 'FC' columns with 0
fc_columns = [col for col in Final_pval_data.columns if 'FC' in col]
Final_pval_data[fc_columns] = Final_pval_data[fc_columns].fillna(0)
# Replace NaN values in 'ovp3' columns with 1
ovp3_columns = [col for col in Final_pval_data.columns if 'ovp3' in col]
Final_pval_data[ovp3_columns] = Final_pval_data[ovp3_columns].fillna(1)
# Replace 'ovp3' with 'pval' in column names
Final_pval_data.columns = [col.replace('ovp3', 'pval') for col in Final_pval_data.columns]
# List to store the ordered column names
ordered_columns = []
ordered_drug_names = []
# Desired order of drug categories
desired_order = ["Antimetabolite", "DNA cross linking agent", "DNA strand break agent", "Microtubule inhibitor"]
# Iterate through the desired order of drug categories
for category in desired_order:
drugs_in_category = drug_category.get(category, [])
for drug in drugs_in_category:
# Add the FC and pval columns for each drug
ordered_columns.extend([f"FC_{drug}", f"pval_{drug}"])
# Add the drug to the ordered list
ordered_drug_names.append(drug)
# Reorder the columns in the DataFrame and reset the index
Final_pval_data = Final_pval_data[ordered_columns]
Final_pval_data = Final_pval_data.reset_index()
# Final merging of both indexing dataframe and data
Final_pval_df = pd.merge(indexing_df, Final_pval_data, on='index')
Final_pval_df.set_index('index', inplace=True)
Final_pval_df.index.name = None
# Saving file
Final_pval_df_path = os.path.join(database_files_stats, f"{directory}_Final_meanFC_pval.xlsx")
Final_pval_df.to_excel(Final_pval_df_path, index=False)
InĀ [108]:
Resistance_Genes = ['TCF21', 'MXD3', 'BHLHA9', 'JUNB', 'ESX1', 'LHX5', 'HMX2', 'MKRN1', 'SIRT1', 'RPF1', 'BCL2L2', 'BCL2', 'CASP4', 'MTCH1', 'TYMS', 'TYMP', 'UCK2']
Sensitizing_Genes = ['MYC', 'YAF2', 'E2F1', 'TFDP1', 'TEF', 'RNF7', 'SIX3', 'CDK2', 'CDK4', 'CDK6', 'FNTA', 'FGF20', 'FGF5', 'EREG', 'PDXK', 'DEDD2', 'EDAR', 'BCL2L15', 'BAD', 'UBTD1', 'SLC28A1', 'SLC28A2', 'SLC29A3', 'SLC29A4', 'TK1', 'YRDC']
InĀ [110]:
from adjustText import adjust_text
# Calculate the number of rows and columns for subplots
num_drugs = len(ordered_drug_names)
num_cols = 4 # Number of columns for subplots
num_rows = (num_drugs + num_cols - 1) // num_cols # Calculate the number of rows
# Create subplots
plt.style.use('default')
fig, axes = plt.subplots(num_rows, num_cols, figsize=(8*num_rows, 6*num_rows), facecolor='white')
# Plot the log2 fold change values against the rank index for each drug
for i, drug in enumerate(ordered_drug_names):
# Calculate the subplot index
row_idx = i // num_cols
col_idx = i % num_cols
# Sort the DataFrame based on the fold change (FC) for the current drug
sorted_df = Final_pval_df.sort_values(by='FC_' + drug, ascending=False)
# Assign ranks starting from 0 to the highest fold change
sorted_df['Rank'] = range(len(sorted_df))
# Plot bar plot for fold change against rank index with color from drug_color_map
# axes[row_idx, col_idx].bar(sorted_df['Rank'], sorted_df['FC_' + drug], label=drug, color=drug_color_map.get(drug, 'skyblue'), edgecolor='none')
axes[row_idx, col_idx].plot(sorted_df['Rank'], sorted_df['FC_' + drug], label=drug, color=drug_color_map.get(drug, 'skyblue'))
axes[row_idx, col_idx].set_title(drug)
axes[row_idx, col_idx].set_xlabel('Rank Index')
axes[row_idx, col_idx].set_ylabel('Fold Change')
axes[row_idx, col_idx].legend()
axes[row_idx, col_idx].grid(False)
axes[row_idx, col_idx].set_xlim(right=15000)
highlighted_genes = [] # To store genes that are highlighted
for gene in sorted_df['Gene_Symbol']:
if gene in Resistance_Genes:
resistance_gene = sorted_df[(sorted_df['Gene_Symbol'] == gene) & ((sorted_df['FC_' + drug] > 1) | (sorted_df['FC_' + drug] < -1)) & (sorted_df['pval_' + drug] < 0.05)]
if not resistance_gene.empty:
x = resistance_gene['Rank'].iloc[0]
y = resistance_gene['FC_' + drug].iloc[0]
highlighted_genes.append((x, y, gene, 'red')) # Add highlighted gene info
elif gene in Sensitizing_Genes:
sensitizing_gene = sorted_df[(sorted_df['Gene_Symbol'] == gene) & ((sorted_df['FC_' + drug] > 1) | (sorted_df['FC_' + drug] < -1)) & (sorted_df['pval_' + drug] < 0.05)]
if not sensitizing_gene.empty:
x = sensitizing_gene['Rank'].iloc[0]
y = sensitizing_gene['FC_' + drug].iloc[0]
highlighted_genes.append((x, y, gene, 'blue')) # Add highlighted gene info
# Add annotations with adjust_text to avoid overlap
texts = [axes[row_idx, col_idx].text(x, y, gene, fontsize=15, color=color) for x, y, gene, color in highlighted_genes]
adjust_text(texts, ax=axes[row_idx, col_idx], arrowprops=dict(arrowstyle='-', color='grey', lw=1.0))
# Highlight corresponding dots in red or blue with size 8
for x, y, _, color in highlighted_genes:
axes[row_idx, col_idx].scatter(x, y, color=color, s=10)
# Adjust layout
# plt.style.use('default')
plt.tight_layout()
plt.show()
InĀ [111]:
# Calculate the number of rows and columns for subplots
num_drugs = len(ordered_drug_names)
num_cols = 4 # Number of columns for subplots
num_rows = (num_drugs + num_cols - 1) // num_cols # Calculate the number of rows
# Create subplots
fig, axes = plt.subplots(num_rows, num_cols, figsize=(8*num_rows, 6*num_rows))
# Plot the volcano plots for each drug
for i, drug in enumerate(ordered_drug_names):
# Calculate the subplot index
row_idx = i // num_cols
col_idx = i % num_cols
# Sort the DataFrame based on the fold change (FC) for the current drug
sorted_df = Final_pval_df.sort_values(by='FC_' + drug, ascending=False)
# Calculate -log10(p-value)
sorted_df['-log10(p-value)'] = -np.log10(sorted_df['pval_' + drug])
# Plot volcano plot
axes[row_idx, col_idx].scatter(sorted_df['FC_' + drug], sorted_df['-log10(p-value)'], color='lightgrey', alpha=0.5, s=5)
axes[row_idx, col_idx].set_title(drug)
axes[row_idx, col_idx].set_xlabel('Log2 Fold Change')
axes[row_idx, col_idx].set_ylabel('-log10(p-value)')
axes[row_idx, col_idx].grid(False)
# Highlight genes with arrows based on Log2FC and p-value criteria
highlighted_genes = [] # To store genes that are highlighted
for gene in sorted_df['Gene_Symbol']:
if gene in Resistance_Genes:
resistance_gene = sorted_df[(sorted_df['Gene_Symbol'] == gene) & ((sorted_df['FC_' + drug] > 1) | (sorted_df['FC_' + drug] < -1)) & (sorted_df['pval_' + drug] < 0.05)]
if not resistance_gene.empty:
x = resistance_gene['FC_' + drug].iloc[0]
y = resistance_gene['-log10(p-value)'].iloc[0]
highlighted_genes.append((x, y, gene, 'red')) # Add highlighted gene info
elif gene in Sensitizing_Genes:
sensitizing_gene = sorted_df[(sorted_df['Gene_Symbol'] == gene) & ((sorted_df['FC_' + drug] > 1) | (sorted_df['FC_' + drug] < -1)) & (sorted_df['pval_' + drug] < 0.05)]
if not sensitizing_gene.empty:
x = sensitizing_gene['FC_' + drug].iloc[0]
y = sensitizing_gene['-log10(p-value)'].iloc[0]
highlighted_genes.append((x, y, gene, 'blue')) # Add highlighted gene info
# Add annotations with adjust_text to avoid overlap
texts = [axes[row_idx, col_idx].text(x, y, gene, fontsize=15, color=color) for x, y, gene, color in highlighted_genes]
adjust_text(texts, ax=axes[row_idx, col_idx], arrowprops=dict(arrowstyle='-', color='grey', lw=0.5))
# Highlight corresponding dots in red or blue with size 8
for x, y, _, color in highlighted_genes:
axes[row_idx, col_idx].scatter(x, y, color=color, s=10)
# Adjust layout
plt.tight_layout()
plt.show()
T Score calculation for High Saturaion Re-Testing¶
InĀ [112]:
# Filter out numpy warnings
warnings.filterwarnings('ignore', category=RuntimeWarning)
# Function to assign bin numbers based on T-score ranges
def assign_t_bin(t_score):
t_score_ranges = {
(4.0, float('inf')): 1,
(3.3, 4.0): 2,
(2.5, 3.3): 3,
(2.0, 2.5): 4,
(-2.5, 2.0): 5,
(-3.3, -2.5): 6,
(-4.0, -3.3): 7,
(-float('inf'), -4.0): 8
}
for score_range, bin_number in t_score_ranges.items():
if score_range[0] < t_score <= score_range[1]:
return bin_number
return 0
# Create a new DataFrame to store T scores
t_score_df = pd.DataFrame()
t_score_df['Verified'] = Final_pval_df['Verified']
# Initialize an empty list to store T value DataFrames for each drug
t_value_dfs = []
control_samples, experimental_samples = separate_batches(updated_divided_triplets)
# Print the results
for i, (control_sample_name, control_sample_data), (experimental_sample_name, experimental_sample_data) in zip(
range(1, len(control_samples) + 1), control_samples, experimental_samples):
for exp_sample in experimental_sample_data:
drug_name = extract_name(exp_sample)
# Access the DataFrame using the individual column names
tested_columns = np.array(batch_compile_stat_df[control_sample_data + exp_sample])
tested_columns = np.array([[float(val) if val != 'nan' else np.nan for val in row] for row in tested_columns],
dtype=float)
[row_num, col_num] = tested_columns.shape
treat_num = len(exp_sample)
control_num = len(control_sample_data)
t = np.zeros([row_num, Iter_num])
w = np.zeros([row_num, Iter_num])
mdiff = np.zeros([row_num, Iter_num])
wz = np.zeros([row_num, Iter_num])
x = tested_columns[:, control_num:] # treatment group
y = tested_columns[:, :control_num] # control group
[t, w, mdiff, wz] = stat_uneq(x, y)
# Create a DataFrame for the T values for this drug
t_value_df = pd.DataFrame({f'T_value_{drug_name}': t}, index=batch_compile_stat_df.index)
# Apply the assign_t_bin function to create the 'Bin' column for this drug
t_value_df[f'Bin_{drug_name}'] = t_value_df[f'T_value_{drug_name}'].apply(assign_t_bin)
# Check if 'Verified' is equal to 0
t_value_df[f'Bin_{drug_name}'] = np.where(t_score_df['Verified'] == 0, 0, t_value_df[f'Bin_{drug_name}'])
# Append the DataFrame to the list of T value DataFrames
t_value_dfs.append(t_value_df)
# Concatenate the list of T value DataFrames into a single DataFrame
t_score_df = pd.concat([t_score_df] + t_value_dfs, axis=1)
t_score_df = t_score_df.drop(['Verified'], axis =1)
# Initialize the "Other" column as an empty string
t_score_df['Other'] = ''
# Loop through each row to calculate the category for each gene
for index, row in t_score_df.iterrows():
all_below_threshold = True
# Loop through the drug names and check their T_value
for drug_name in drug_name_list:
t_value_column = f'T_value_{drug_name}'
t_value = row[t_value_column]
if not pd.isna(t_value) and abs(t_value) >= 2:
all_below_threshold = False
break # Exit the loop if any T_value is above or equal to 2
if all_below_threshold:
t_score_df.at[index, 'Other'] = 'Control'
else:
counts = {
'Peripheral': 0,
'Core': 0,
}
# Loop through the drug names and check their T_value
for drug_name in drug_name_list:
t_value_column = f'T_value_{drug_name}'
t_value = row[t_value_column]
if not pd.isna(t_value):
if abs(t_value) > 4:
counts['Peripheral'] += 1
if abs(t_value) > 2:
counts['Core'] += 1
if counts['Core'] >= 8:
t_score_df.at[index, 'Other'] = 'Core'
elif counts['Peripheral'] >= 1:
t_score_df.at[index, 'Other'] = 'Peripheral'
# After the code, you can reset the warnings to their default behavior
Final_pval_Index = Final_pval_df.iloc[:, :7]
Final_pval_Index = Final_pval_Index.reset_index()
t_score_df = t_score_df.reset_index()
t_score_compiled_df = pd.merge(Final_pval_Index, t_score_df, on = 'index')
t_score_compiled_df.set_index('index', inplace =True)
t_score_compiled_df.index.name = None
t_score_compiled_path = os.path.join(database_files_stats, f"{directory}_T_score.xlsx")
t_score_compiled_df.to_excel(t_score_compiled_path, index=False)
warnings.resetwarnings()
InĀ [113]:
t_testing_path = os.path.join(database_files_stats, f"{directory}_T_score_testing.xlsx")
t_testing_df = pd.read_excel(t_testing_path)
InĀ [114]:
# Create a new column to store the match result
t_testing_df['Match'] = False
# Iterate through each row in the DataFrame
for index, row in t_testing_df.iterrows():
previous_drug = row['Previous Drug']
previous_number = row['Previous number']
other = row['Other']
for column_name in t_testing_df.columns:
if column_name.startswith('Bin_'):
drug_name = column_name.split('_')[1]
current_number = row[column_name]
# Check if the "Previous number" and "Bin number" are both within the specified ranges
if previous_drug == drug_name and (1 <= previous_number <= 4) and (1 <= current_number <= 4):
# Both in the positive range, set 'Match' to True
t_testing_df.at[index, 'Match'] = True
elif previous_drug == drug_name and (5 <= previous_number <= 8) and (5 <= current_number <= 8):
# Both in the negative range, set 'Match' to True
t_testing_df.at[index, 'Match'] = True
if previous_drug == other:
t_testing_df.at[index, 'Match'] = True
t_testing_compiled_path = os.path.join(database_files_stats, f"{directory}_T_score_testing_match2.xlsx")
t_testing_df.to_excel(t_testing_compiled_path, index=False)
Z-Score evaluation¶
InĀ [115]:
# Prepare datafarme with only baseline and DMSO
finding_zscore = pd.read_excel(os.path.join(database_files_stats, f"{directory}_all_meanFC_allpval.xlsx"))
zscore_df = finding_zscore[baseline_dmso_columns]
# Prepare 4 columns division for the
zscore_divide = [baseline_dmso_columns[i:i + 4] for i in range(0, len(baseline_dmso_columns), 4)]
# Initialize a dictionary to store the differences for each batch
zscore_data_dict = {}
# Calculate the differences between Baseline and DMSO for each set
for i, set_columns in enumerate(zscore_divide):
baseline_column = set_columns[0]
dmso_columns = set_columns[1:]
# Calculate the mean for each DMSO group
mean_dmso = zscore_df[dmso_columns].mean(axis=1)
# Subtract the mean of DMSO from Baseline for each set
differences = mean_dmso - zscore_df[baseline_column]
# Store the differences for this batch
batch_name = f'Batch {i + 1}'
zscore_data_dict[batch_name] = differences
# Create a DataFrame from the differences dictionary
zscore_data_df = pd.DataFrame(zscore_data_dict)
# Convert log2FC to Z score by columns
zscore_data_df = Zscore(zscore_data_df)
# Calculate variance of Z scores for each batch
variances = zscore_data_df.var()
# Calculate weights as the inverse of the variances
weights = 1 / variances
# Normalize the weights to sum up to 1
weights_normalized = weights / sum(weights)
# Converting normalized weight to array
weights_normalized_array = weights_normalized.values
# Set up the Combining Z score methods
Chosen_combining_methods = "Z_score_transformation" # Change this to the desired method
# Assign following: "Z_score", "Meta-Analysis", "Stouffer",
# "Fisher", "Z_score_Average", "Z_score_transformation", "Inverse_variance"
if Chosen_combining_methods == "Z_score":
## Z score at each row - Null hypothesis testing between each gene's Z score of batches
# Calculate sum, mean, standard deviation for each genes (row)
mean_per_row = zscore_data_df.mean(axis=1)
std_per_row = zscore_data_df.std(axis=1)
sum_per_row = np.sum(zscore_data_df, axis=1)
# Combined Z score transformation for row
combined_Z_scores = (sum_per_row - mean_per_row) / std_per_row
elif Chosen_combining_methods == "Meta-Analysis":
## Meta-Analysis with Random Effects Model
# Change datafarme to arrays
z_score_val = zscore_data_df.values
# Calculate weighted Z-scores and total weight
weighted_z_scores = z_score_val * np.sqrt(1 / weights_normalized_array)
total_weight = np.sum(1 / weights_normalized_array)
combined_Z_scores = np.sum(weighted_z_scores, axis=1) / total_weight
elif Chosen_combining_methods == "Stouffer":
## Stouffer's Z-score method
# Initialize an array to store the combined Z scores for each sample
combined_Z_scores = np.zeros(zscore_data_df.shape[0])
# Iterate over each sample (row) and calculate the combined Z score
for i in range(zscore_data_df.shape[0]):
numerator = np.sum(zscore_data_df.iloc[i] * weights_normalized)
denominator = np.sqrt(np.sum(weights_normalized))
combined_Z_scores[i] = numerator / denominator
elif Chosen_combining_methods == "Z_score_Average":
## Combining Z Scores by Weighted Average
# Calculate the numerator: weighted sum of Z scores for each row
numerator = (zscore_data_df.values * weights_normalized_array).sum(axis=1)
# Calculate the denominator: square root of the sum of weights
denominator = np.sqrt(sum(weights))
# Calculate the combined Z score for each row using Stouffer's Z-score method
combined_Z_scores = numerator / denominator
elif Chosen_combining_methods == "Fisher":
# Fisher's method
from scipy.stats import norm, chi2
# Calculate the chi-squared statistic using Fisher's method
# Fisher's method: X^2 = -2 * sum(log(p_values))
# Since Z scores are used, we'll square them to get the p-values (two-tailed test)
# p_values = 2 * norm.cdf(-abs(Z_scores))
pscore_data_df = zscore_data_df.copy()
p_values = 2 * norm.cdf(-np.abs(pscore_data_df))
# Calculate the chi-squared statistic using Fisher's method
combined_chi_squared = -2 * np.sum(np.log(p_values), axis=1)
# Degrees of freedom = 2 * number of batches
df = 2 * pscore_data_df.shape[1]
# Calculate the p-value using the chi-squared distribution
combined_p_value = 1 - chi2.cdf(combined_chi_squared, df)
elif Chosen_combining_methods == "Z_score_transformation":
## Z Score Transformation followed by Averaging
# Calculate mean and standard deviation for each row
mean_per_row = zscore_data_df.mean(axis=1)
std_per_row = zscore_data_df.std(axis=1)
# Inverse Z Score Transformation
# Calculate the inverse Z score transformation for each column
inverse_transformed_values = zscore_data_df.apply(lambda col: col * std_per_row + mean_per_row , axis=0)
# Calculate the combined Z score for each row by averaging the inverse transformed values
combined_Z_scores = inverse_transformed_values.mean(axis=1)
elif Chosen_combining_methods == "Inverse_variance":
## Inverse Variance Weighting
# Assume standard error (SE) for Z scores is 1
SE = np.ones_like(zscore_data_df.values)
# Calculate the inverse variance weights
inverse_variance_weights = 1 / (SE ** 2)
# Calculate the numerator and denominator for the combined Z score
numerator = np.sum(zscore_data_df / SE, axis=1)
denominator = np.sum(inverse_variance_weights, axis=1)
# Calculate the combined Z score for each row
combined_Z_scores_inverse_variance = numerator / denominator
combined_Z_scores = combined_Z_scores_inverse_variance
else:
print("The combined Z method is not set")
# Align Calculated Combined Z score into dataframe
zscore_data_df['Combined_Z_Score'] = combined_Z_scores
# Assign list for headers
all_header = zscore_data_df.columns.to_list()
# # If any of batches have NaN value, change combined Z score to NaN
# zscore_data_df.loc[zscore_data_df[all_header].isnull().any(axis=1), 'Combined_Z_Score'] = np.nan
# Saving initial Z score dataframe
zscore_data_path = os.path.join(database_files_stats, f"{directory}_zscore.xlsx")
finding_zscore_index = finding_zscore.iloc[:, :4]
zscore_data_compile_df = pd.concat([finding_zscore_index, zscore_data_df], axis=1)
zscore_data_compile_df.to_excel(zscore_data_path, index=False)
# zscore_data_compile_df_sorted = zscore_data_compile_df.sort_values(
# by='Combined_Z_Score',
# ascending=False)
# zscore_data_compile_df = save_dataframe(fasta_data, zscore_data_df, zscore_data_path)
# Identify batch columns
batch_cols = [col for col in zscore_data_compile_df.columns if col.startswith('Batch')]
# Fill any remaining NaNs in Combined_Z_Score with the first available non-NaN from batch_cols
zscore_data_compile_df['Combined_Z_Score'] = zscore_data_compile_df.apply(
lambda row: next((row[col] for col in batch_cols if not pd.isna(row[col])), np.nan)
if pd.isna(row['Combined_Z_Score']) else row['Combined_Z_Score'],
axis=1
)
# Count non-NaN batch values per row
zscore_data_compile_df['NonNaN_Batch_Count'] = zscore_data_compile_df[batch_cols].notna().sum(axis=1)
# Sort by number of valid batches descending, then Combined_Z_Score descending
zscore_data_compile_df_sorted = zscore_data_compile_df.sort_values(
by=['NonNaN_Batch_Count', 'Combined_Z_Score'],
ascending=[False, False]
)
zscore_data_compile_df_sorted_negative = zscore_data_compile_df.sort_values(
by=['NonNaN_Batch_Count', 'Combined_Z_Score'],
ascending=[False, True]
)
# Drop helper column
zscore_data_compile_df_sorted = zscore_data_compile_df_sorted.drop(columns=['NonNaN_Batch_Count'])
zscore_data_compile_df_sorted_negative = zscore_data_compile_df_sorted_negative.drop(columns=['NonNaN_Batch_Count'])
PR Curve¶
InĀ [116]:
# Induce dataframe from TCGA
# TCGA_data = pd.read_excel(f"/home/{user_id}/rnaseq_analysis/Reference/hORFeome9.1/20250718_ORFeome_PR_Curves.xlsx", sheet_name="Evidence")
TCGA_data = pd.read_excel(f"/home/{user_id}/rnaseq_analysis/Reference/hORFeome9.1/20221003_HPA_Pathology.xlsx", sheet_name="#Evidence")
TCGA_data['Entrez'].astype(float)
zscore_merge_df = pd.merge(zscore_data_compile_df_sorted, TCGA_data[['Entrez', '#favorable', '#unfavorable']], left_on='NCBI', right_on='Entrez', how='left')
# # Drop the duplicate "Entrez" column
# zscore_merge_df.drop('Entrez', axis=1, inplace=True)
# Identify rows with no match based on Entrez
unmatched_rows = zscore_merge_df['Entrez'].isnull()
# Merge based on Gene for unmatched rows
zscore_merge_df.loc[unmatched_rows, 'Gene_Symbol'] = zscore_merge_df.loc[unmatched_rows, 'Gene_Symbol'].apply(lambda x: x.upper())
zscore_merge_df = pd.merge(zscore_merge_df, TCGA_data[['Entrez', '#favorable', '#unfavorable', 'Gene name']], left_on='Gene_Symbol', right_on='Gene name', how='left')
# Drop the duplicate "Entrez" column
zscore_merge_df.drop('Entrez_y', axis=1, inplace=True)
zscore_merge_df.rename(columns={'Entrez_x': 'Entrez'}, inplace=True)
# Replace NaN values in #favorable_x with values from #Cervical_Favor_y if NaN
zscore_merge_df['#favorable_x'] = zscore_merge_df['#favorable_x'].fillna(zscore_merge_df['#favorable_y'])
# Replace NaN values in #unfavorable_x with values from #unfavorable_y if NaN
zscore_merge_df['#unfavorable_x'] = zscore_merge_df['#unfavorable_x'].fillna(zscore_merge_df['#unfavorable_y'])
# Drop the unwanted columns
zscore_merge_df.drop(['Entrez', 'Gene name', '#favorable_y', '#unfavorable_y'], axis=1, inplace=True)
# Rename the merged columns to the original column names
zscore_merge_df.rename(columns={'#favorable_x': '#favorable', '#unfavorable_x': '#unfavorable'}, inplace=True)
# Mark the unfavourable to hit
zscore_merge_df['Hit'] = zscore_merge_df.apply(lambda row: 1 if row['#unfavorable'] >= 1 else 0, axis=1)
# Find first hit index
first_hit_index = zscore_merge_df[zscore_merge_df['Hit'] == 1].index.min()
# Set Rank starting from the first Hit
if pd.isna(first_hit_index):
zscore_merge_df['Precision'] = 0
zscore_merge_df['Recall'] = 0
zscore_merge_df['Rank'] = range(1, len(zscore_merge_df) + 1)
else:
zscore_merge_df['Rank'] = None
zscore_merge_df.loc[first_hit_index:, 'Rank'] = range(1, len(zscore_merge_df) - first_hit_index + 1)
# Initialize variables
precision_values = []
recall_values = []
precision_cumulative_hit = 0
recall_cumulative_hit = 0
hit_count = zscore_merge_df['Hit'].sum()
# Compute Precision and Recall
for idx, row in zscore_merge_df.iterrows():
if idx < first_hit_index:
precision_values.append(None)
recall_values.append(None)
continue
precision_cumulative_hit += row['Hit']
recall_cumulative_hit += row['Hit']
precision = 100 if precision_cumulative_hit == 1 else (precision_cumulative_hit / row['Rank'] * 100)
recall = recall_cumulative_hit / hit_count * 100
precision_values.append(precision)
recall_values.append(recall)
zscore_merge_df['Precision'] = precision_values
zscore_merge_df['Recall'] = recall_values
# Calculate cumulative max of precision
zscore_merge_df['Max(Precision)'] = zscore_merge_df['Precision'].iloc[::-1].cummax()[::-1]
# Calculate the cumulative maximum of the 'Precision' column from each row onwards
zscore_merge_df['Max(Precision)'] = zscore_merge_df['Precision'].iloc[::-1].cummax()[::-1]
# Save final DataFrame
zscore_merge_path = os.path.join(database_files_stats, f"{directory}_HPA_Unfavor_old.xlsx")
zscore_merge_df.to_excel(zscore_merge_path, index=False)
Old_HPA_unfavorable_df = zscore_merge_df.copy()
# Assuming you have the 'Rank', 'Precision', and 'Recall' columns in the DataFrame
rank = zscore_merge_df['Rank']
precision = zscore_merge_df['Max(Precision)']
recall = zscore_merge_df['Recall']
# Compute F1 scores
f1_scores = 2 * (precision * recall) / (precision + recall)
# Find the index where precision and recall are closest
diff = np.abs(precision - recall)
min_diff_idx = diff.idxmin()
f1_val = f1_scores[min_diff_idx]
rank_at_f1 = rank[min_diff_idx]
min_precision = precision.min()
# Print F1 score where precision ā recall
print(f"F1 score where Precision ā Recall: {f1_val:.2f}")
print(f"Corresponding Rank: {int(rank_at_f1)}")
# Plot Precision on primary y-axis
fig, ax1 = plt.subplots(figsize=(8, 8))
color_prec = '#dc143cff'
color_recall = '#003e98ff'
# Plot Precision
ax1.plot(rank, recall, label='Precision', linewidth=7.0, color=color_recall)
ax1.set_xlabel('Rank (log scale)')
ax1.set_ylabel('Precision')
ax1.tick_params(axis='y')
ax1.set_xscale('log')
ax1.set_ylim(0, 105)
ax1.set_title('HPA Unfavour PR Curve')
ax1.grid(False)
ax1.xaxis.set_major_formatter(LogFormatter(labelOnlyBase=True))
# ax1.set_facecolor('#FAFAFA')
ax1.minorticks_off()
# Horizontal line at minimum precision
ax1.axhline(y=min_precision, color='black', linestyle='--', label=f'Min Precision = {min_precision:.2f}')
# Plot Recall on secondary y-axis
ax2 = ax1.twinx()
ax2.plot(rank, precision, label='Recall', linewidth=7.0, color=color_prec)
ax2.set_ylabel('Recall')
ax2.set_ylim(0, 105)
ax2.tick_params(axis='y')
# Define the file path for storing graph
PR_path = os.path.join(graphs_files_stats, f"{directory}_old_HPA_Unfavor_PR.svg")
plt.savefig(PR_path, format='svg', dpi=1000)
print(f"{directory}_PR_Curve.svg to {PR_path}")
plt.show()
F1 score where Precision ā Recall: 36.92 Corresponding Rank: 5249 RQ023682_PR_Curve.svg to /home/harryjo/rnaseq_analysis/RQ023682/RQ023682_graphs/Statistical_Graph/RQ023682_old_HPA_Unfavor_PR.svg
InĀ [117]:
Old_HPA_unfavorable_df
Out[117]:
| ORF_ID | NCBI | Group | Gene_Symbol | Batch 1 | Batch 2 | Batch 3 | Batch 4 | Batch 5 | Combined_Z_Score | #favorable | #unfavorable | Hit | Rank | Precision | Recall | Max(Precision) | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 53066 | 7691 | G04 | ZNF132 | -1.351889 | 5.791197 | 4.831242 | 1.339717 | 1.311387 | 9.314892 | 2.0 | 0.0 | 0 | None | NaN | NaN | NaN |
| 1 | 100000022 | 84133 | G08 | ZNRF3 | 2.472490 | -1.661915 | 4.844651 | 4.483725 | 1.916883 | 8.681986 | 0.0 | 0.0 | 0 | None | NaN | NaN | NaN |
| 2 | 100010472 | 6051 | G04 | RNPEP | 3.737812 | 1.740719 | 6.325549 | -1.607528 | -0.132890 | 8.326448 | 0.0 | 0.0 | 0 | None | NaN | NaN | NaN |
| 3 | 5102 | 55214 | G05 | LEPREL1 | -0.889157 | 0.244954 | 4.972440 | 4.312032 | 1.626191 | 7.261316 | 0.0 | 0.0 | 0 | None | NaN | NaN | NaN |
| 4 | 9924 | 22795 | G06 | NID2 | 2.166479 | -0.034532 | 5.174375 | 3.790967 | -0.653718 | 7.253175 | 0.0 | 2.0 | 1 | 1 | 100.000000 | 0.019238 | 100.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 14525 | 53867 | 5290 | G06 | PIK3CA | 0.109405 | NaN | NaN | NaN | NaN | 0.109405 | 0.0 | 0.0 | 0 | 14522 | 35.780196 | 99.961524 | 35.786575 |
| 14526 | 10422 | 7786 | G02 | MAP3K12 | -0.360605 | NaN | NaN | NaN | NaN | -0.360605 | 1.0 | 1.0 | 1 | 14523 | 35.784618 | 99.980762 | 35.786575 |
| 14527 | 71971 | 3747 | G08 | KCNC2 | NaN | NaN | NaN | NaN | -1.099003 | -1.099003 | 0.0 | 0.0 | 0 | 14524 | 35.782154 | 99.980762 | 35.786575 |
| 14528 | 7993 | 6170 | G02 | RPL39 | NaN | NaN | NaN | NaN | -2.406590 | -2.406590 | 0.0 | 1.0 | 1 | 14525 | 35.786575 | 100.000000 | 35.786575 |
| 14529 | 100015186 | 84435 | G08 | GPR123 | NaN | NaN | NaN | NaN | NaN | NaN | 0.0 | 0.0 | 0 | 14526 | 35.784111 | 100.000000 | 35.784111 |
14530 rows Ć 17 columns
InĀ [118]:
# Induce dataframe from TCGA
TCGA_data = pd.read_excel(f"/home/{user_id}/rnaseq_analysis/Reference/hORFeome9.1/20250718_ORFeome_PR_Curves.xlsx", sheet_name="Evidence")
# TCGA_data = pd.read_excel(f"/home/{user_id}/rnaseq_analysis/Reference/hORFeome9.1/20221003_HPA_Pathology.xlsx", sheet_name="#Evidence")
TCGA_data['Entrez'].astype(float)
zscore_merge_df = pd.merge(zscore_data_compile_df_sorted, TCGA_data[['Entrez', '#favorable', '#unfavorable']], left_on='NCBI', right_on='Entrez', how='left')
# # Drop the duplicate "Entrez" column
# zscore_merge_df.drop('Entrez', axis=1, inplace=True)
# Identify rows with no match based on Entrez
unmatched_rows = zscore_merge_df['Entrez'].isnull()
# Merge based on Gene for unmatched rows
zscore_merge_df.loc[unmatched_rows, 'Gene_Symbol'] = zscore_merge_df.loc[unmatched_rows, 'Gene_Symbol'].apply(lambda x: x.upper())
zscore_merge_df = pd.merge(zscore_merge_df, TCGA_data[['Entrez', '#favorable', '#unfavorable', 'Gene name']], left_on='Gene_Symbol', right_on='Gene name', how='left')
# Drop the duplicate "Entrez" column
zscore_merge_df.drop('Entrez_y', axis=1, inplace=True)
zscore_merge_df.rename(columns={'Entrez_x': 'Entrez'}, inplace=True)
# Replace NaN values in #favorable_x with values from #Cervical_Favor_y if NaN
zscore_merge_df['#favorable_x'] = zscore_merge_df['#favorable_x'].fillna(zscore_merge_df['#favorable_y'])
# Replace NaN values in #unfavorable_x with values from #unfavorable_y if NaN
zscore_merge_df['#unfavorable_x'] = zscore_merge_df['#unfavorable_x'].fillna(zscore_merge_df['#unfavorable_y'])
# Drop the unwanted columns
zscore_merge_df.drop(['Entrez', 'Gene name', '#favorable_y', '#unfavorable_y'], axis=1, inplace=True)
# Rename the merged columns to the original column names
zscore_merge_df.rename(columns={'#favorable_x': '#favorable', '#unfavorable_x': '#unfavorable'}, inplace=True)
# Mark the unfavourable to hit
zscore_merge_df['Hit'] = zscore_merge_df.apply(lambda row: 1 if row['#unfavorable'] >= 1 else 0, axis=1)
# Find first hit index
first_hit_index = zscore_merge_df[zscore_merge_df['Hit'] == 1].index.min()
# Set Rank starting from the first Hit
if pd.isna(first_hit_index):
zscore_merge_df['Precision'] = 0
zscore_merge_df['Recall'] = 0
zscore_merge_df['Rank'] = range(1, len(zscore_merge_df) + 1)
else:
zscore_merge_df['Rank'] = None
zscore_merge_df.loc[first_hit_index:, 'Rank'] = range(1, len(zscore_merge_df) - first_hit_index + 1)
# Initialize variables
precision_values = []
recall_values = []
precision_cumulative_hit = 0
recall_cumulative_hit = 0
hit_count = zscore_merge_df['Hit'].sum()
# Compute Precision and Recall
for idx, row in zscore_merge_df.iterrows():
if idx < first_hit_index:
precision_values.append(None)
recall_values.append(None)
continue
precision_cumulative_hit += row['Hit']
recall_cumulative_hit += row['Hit']
precision = 100 if precision_cumulative_hit == 1 else (precision_cumulative_hit / row['Rank'] * 100)
recall = recall_cumulative_hit / hit_count * 100
precision_values.append(precision)
recall_values.append(recall)
zscore_merge_df['Precision'] = precision_values
zscore_merge_df['Recall'] = recall_values
# Calculate cumulative max of precision
zscore_merge_df['Max(Precision)'] = zscore_merge_df['Precision'].iloc[::-1].cummax()[::-1]
# Calculate the cumulative maximum of the 'Precision' column from each row onwards
zscore_merge_df['Max(Precision)'] = zscore_merge_df['Precision'].iloc[::-1].cummax()[::-1]
# Save final DataFrame
zscore_merge_path = os.path.join(database_files_stats, f"{directory}_HPA_Unfavor_new.xlsx")
zscore_merge_df.to_excel(zscore_merge_path, index=False)
New_HPA_unfavorable_df = zscore_merge_df.copy()
# Assuming you have the 'Rank', 'Precision', and 'Recall' columns in the DataFrame
rank = zscore_merge_df['Rank']
precision = zscore_merge_df['Max(Precision)']
recall = zscore_merge_df['Recall']
# Compute F1 scores
f1_scores = 2 * (precision * recall) / (precision + recall)
# Find the index where precision and recall are closest
diff = np.abs(precision - recall)
min_diff_idx = diff.idxmin()
f1_val = f1_scores[min_diff_idx]
rank_at_f1 = rank[min_diff_idx]
min_precision = precision.min()
# Print F1 score where precision ā recall
print(f"F1 score where Precision ā Recall: {f1_val:.2f}")
print(f"Corresponding Rank: {int(rank_at_f1)}")
# Plot Precision on primary y-axis
fig, ax1 = plt.subplots(figsize=(8, 8))
color_prec = '#dc143cff'
color_recall = '#003e98ff'
# Plot Precision
ax1.plot(rank, recall, label='Precision', linewidth=7.0, color=color_recall)
ax1.set_xlabel('Rank (log scale)')
ax1.set_ylabel('Precision')
ax1.tick_params(axis='y')
ax1.set_xscale('log')
ax1.set_ylim(0, 105)
ax1.set_title('HPA Unfavor PR Curve')
ax1.grid(False)
ax1.xaxis.set_major_formatter(LogFormatter(labelOnlyBase=True))
# ax1.set_facecolor('#FAFAFA')
ax1.minorticks_off()
# Horizontal line at minimum precision
ax1.axhline(y=min_precision, color='black', linestyle='--', label=f'Min Precision = {min_precision:.2f}')
# Plot Recall on secondary y-axis
ax2 = ax1.twinx()
ax2.plot(rank, precision, label='Recall', linewidth=7.0, color=color_prec)
ax2.set_ylabel('Recall')
ax2.set_ylim(0, 105)
ax2.tick_params(axis='y')
# Define the file path for storing graph
PR_path = os.path.join(graphs_files_stats, f"{directory}_new_HPA_Unfavor.svg")
plt.savefig(PR_path, format='svg', bbox_inches='tight', dpi=1000)
print(f"{directory}_PR_Curve.svg to {PR_path}")
plt.show()
F1 score where Precision ā Recall: 42.74 Corresponding Rank: 6080 RQ023682_PR_Curve.svg to /home/harryjo/rnaseq_analysis/RQ023682/RQ023682_graphs/Statistical_Graph/RQ023682_new_HPA_Unfavor.svg
InĀ [119]:
New_HPA_unfavorable_df
Out[119]:
| ORF_ID | NCBI | Group | Gene_Symbol | Batch 1 | Batch 2 | Batch 3 | Batch 4 | Batch 5 | Combined_Z_Score | #favorable | #unfavorable | Hit | Rank | Precision | Recall | Max(Precision) | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 53066 | 7691 | G04 | ZNF132 | -1.351889 | 5.791197 | 4.831242 | 1.339717 | 1.311387 | 9.314892 | 1.0 | 0.0 | 0 | None | NaN | NaN | NaN |
| 1 | 100000022 | 84133 | G08 | ZNRF3 | 2.472490 | -1.661915 | 4.844651 | 4.483725 | 1.916883 | 8.681986 | 2.0 | 0.0 | 0 | None | NaN | NaN | NaN |
| 2 | 100010472 | 6051 | G04 | RNPEP | 3.737812 | 1.740719 | 6.325549 | -1.607528 | -0.132890 | 8.326448 | 1.0 | 0.0 | 0 | None | NaN | NaN | NaN |
| 3 | 5102 | 55214 | G05 | LEPREL1 | -0.889157 | 0.244954 | 4.972440 | 4.312032 | 1.626191 | 7.261316 | 0.0 | 1.0 | 1 | 1 | 100.000000 | 0.016472 | 100.000000 |
| 4 | 9924 | 22795 | G06 | NID2 | 2.166479 | -0.034532 | 5.174375 | 3.790967 | -0.653718 | 7.253175 | 0.0 | 2.0 | 1 | 2 | 100.000000 | 0.032944 | 100.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 14502 | 53867 | 5290 | G06 | PIK3CA | 0.109405 | NaN | NaN | NaN | NaN | 0.109405 | 1.0 | 0.0 | 0 | 14500 | 41.855172 | 99.967056 | 41.860305 |
| 14503 | 10422 | 7786 | G02 | MAP3K12 | -0.360605 | NaN | NaN | NaN | NaN | -0.360605 | 1.0 | 1.0 | 1 | 14501 | 41.859182 | 99.983528 | 41.860305 |
| 14504 | 71971 | 3747 | G08 | KCNC2 | NaN | NaN | NaN | NaN | -1.099003 | -1.099003 | 0.0 | 0.0 | 0 | 14502 | 41.856296 | 99.983528 | 41.860305 |
| 14505 | 7993 | 6170 | G02 | RPL39 | NaN | NaN | NaN | NaN | -2.406590 | -2.406590 | 1.0 | 3.0 | 1 | 14503 | 41.860305 | 100.000000 | 41.860305 |
| 14506 | 100015186 | 84435 | G08 | GPR123 | NaN | NaN | NaN | NaN | NaN | NaN | 1.0 | 0.0 | 0 | 14504 | 41.857419 | 100.000000 | 41.857419 |
14507 rows Ć 17 columns
InĀ [120]:
# Induce dataframe from TCGA
TCGA_data = pd.read_excel(f"/home/{user_id}/rnaseq_analysis/Reference/hORFeome9.1/20250718_ORFeome_PR_Curves.xlsx", sheet_name="Evidence")
# TCGA_data = pd.read_excel(f"/home/{user_id}/rnaseq_analysis/Reference/hORFeome9.1/20221003_HPA_Pathology.xlsx", sheet_name="#Evidence")
TCGA_data['Entrez'].astype(float)
# zscore_merge_df = pd.merge(zscore_data_compile_df_sorted, TCGA_data[['Entrez', '#favorable', '#unfavorable']], left_on='NCBI', right_on='Entrez', how='left')
# zscore_merge_df = zscore_merge_df.sort_values(by=['Combined_Z_Score'], ascending=True)
zscore_merge_df = pd.merge(zscore_data_compile_df_sorted_negative, TCGA_data[['Entrez', '#favorable', '#unfavorable']], left_on='NCBI', right_on='Entrez', how='left')
# # Drop the duplicate "Entrez" column
# zscore_merge_df.drop('Entrez', axis=1, inplace=True)
# Identify rows with no match based on Entrez
unmatched_rows = zscore_merge_df['Entrez'].isnull()
# Merge based on Gene for unmatched rows
zscore_merge_df.loc[unmatched_rows, 'Gene_Symbol'] = zscore_merge_df.loc[unmatched_rows, 'Gene_Symbol'].apply(lambda x: x.upper())
zscore_merge_df = pd.merge(zscore_merge_df, TCGA_data[['Entrez', '#favorable', '#unfavorable', 'Gene name']], left_on='Gene_Symbol', right_on='Gene name', how='left')
# Drop the duplicate "Entrez" column
zscore_merge_df.drop('Entrez_y', axis=1, inplace=True)
zscore_merge_df.rename(columns={'Entrez_x': 'Entrez'}, inplace=True)
# Replace NaN values in #favorable_x with values from #Cervical_Favor_y if NaN
zscore_merge_df['#favorable_x'] = zscore_merge_df['#favorable_x'].fillna(zscore_merge_df['#favorable_y'])
# Replace NaN values in #unfavorable_x with values from #unfavorable_y if NaN
zscore_merge_df['#unfavorable_x'] = zscore_merge_df['#unfavorable_x'].fillna(zscore_merge_df['#unfavorable_y'])
# Drop the unwanted columns
zscore_merge_df.drop(['Entrez', 'Gene name', '#favorable_y', '#unfavorable_y'], axis=1, inplace=True)
# Rename the merged columns to the original column names
zscore_merge_df.rename(columns={'#favorable_x': '#favorable', '#unfavorable_x': '#unfavorable'}, inplace=True)
# Mark the unfavourable to hit
zscore_merge_df['Hit'] = zscore_merge_df.apply(lambda row: 1 if row['#favorable'] >= 1 else 0, axis=1)
# zscore_merge_df = zscore_merge_df.iloc[2:, :]
# Calculate the total number of rows in the DataFrame
total_rows = len(zscore_merge_df)
# # Add a new column 'Rank' with values from 1 to total_rows
# zscore_merge_df['Rank'] = range(1, total_rows + 1)
# Find first hit index
first_hit_index = zscore_merge_df[zscore_merge_df['Hit'] == 1].index.min()
# Set Rank starting from the first Hit
if pd.isna(first_hit_index):
zscore_merge_df['Precision'] = 0
zscore_merge_df['Recall'] = 0
zscore_merge_df['Rank'] = range(1, len(zscore_merge_df) + 1)
else:
zscore_merge_df['Rank'] = None
zscore_merge_df.loc[first_hit_index:, 'Rank'] = range(1, len(zscore_merge_df) - first_hit_index + 1)
# Initialize variables
precision_values = []
recall_values = []
precision_cumulative_hit = 0
recall_cumulative_hit = 0
hit_count = zscore_merge_df['Hit'].sum()
# Compute Precision and Recall
for idx, row in zscore_merge_df.iterrows():
if idx < first_hit_index:
precision_values.append(None)
recall_values.append(None)
continue
precision_cumulative_hit += row['Hit']
recall_cumulative_hit += row['Hit']
precision = 100 if precision_cumulative_hit == 1 else (precision_cumulative_hit / row['Rank'] * 100)
recall = recall_cumulative_hit / hit_count * 100
precision_values.append(precision)
recall_values.append(recall)
zscore_merge_df['Precision'] = precision_values
zscore_merge_df['Recall'] = recall_values
# Calculate cumulative max of precision
zscore_merge_df['Max(Precision)'] = zscore_merge_df['Precision'].iloc[::-1].cummax()[::-1]
# Calculate the cumulative maximum of the 'Precision' column from each row onwards
zscore_merge_df['Max(Precision)'] = zscore_merge_df['Precision'].iloc[::-1].cummax()[::-1]
# Save final DataFrame
zscore_merge_path = os.path.join(database_files_stats, f"{directory}_HPA_favor.xlsx")
zscore_merge_df.to_excel(zscore_merge_path, index=False)
HPA_new_favorable_df = zscore_merge_df.copy()
# Assuming you have the 'Rank', 'Precision', and 'Recall' columns in the DataFrame
rank = zscore_merge_df['Rank']
precision = zscore_merge_df['Max(Precision)']
recall = zscore_merge_df['Recall']
# Compute F1 scores
f1_scores = 2 * (precision * recall) / (precision + recall)
# Find the index where precision and recall are closest
diff = np.abs(precision - recall)
min_diff_idx = diff.idxmin()
f1_val = f1_scores[min_diff_idx]
rank_at_f1 = rank[min_diff_idx]
min_precision = precision.min()
# Print F1 score where precision ā recall
print(f"F1 score where Precision ā Recall: {f1_val:.2f}")
print(f"Corresponding Rank: {int(rank_at_f1)}")
# Plot Precision on primary y-axis
fig, ax1 = plt.subplots(figsize=(8, 8))
color_prec = '#dc143cff'
color_recall = '#003e98ff'
# Plot Precision
ax1.plot(rank, recall, label='Precision', linewidth=7.0, color=color_recall)
ax1.set_xlabel('Rank (log scale)')
ax1.set_ylabel('Precision')
ax1.tick_params(axis='y')
ax1.set_xscale('log')
ax1.set_ylim(0, 105)
ax1.set_title('HPA favorable PR Curve')
ax1.grid(False)
ax1.xaxis.set_major_formatter(LogFormatter(labelOnlyBase=True))
# ax1.set_facecolor('#FAFAFA')
ax1.minorticks_off()
# Horizontal line at minimum precision
ax1.axhline(y=min_precision, color='black', linestyle='--', label=f'Min Precision = {min_precision:.2f}')
# Plot Recall on secondary y-axis
ax2 = ax1.twinx()
ax2.plot(rank, precision, label='Recall', linewidth=7.0, color=color_prec)
ax2.set_ylabel('Recall')
ax2.set_ylim(0, 105)
ax2.tick_params(axis='y')
# Define the file path for storing graph
PR_path = os.path.join(graphs_files_stats, f"{directory}_new_HPA_favorable_PR.svg")
plt.savefig(PR_path, format='svg', bbox_inches='tight', dpi=1000)
print(f"{directory}_PR_Curve.svg to {PR_path}")
plt.show()
F1 score where Precision ā Recall: 46.48 Corresponding Rank: 6991 RQ023682_PR_Curve.svg to /home/harryjo/rnaseq_analysis/RQ023682/RQ023682_graphs/Statistical_Graph/RQ023682_new_HPA_favorable_PR.svg
InĀ [121]:
HPA_new_favorable_df
Out[121]:
| ORF_ID | NCBI | Group | Gene_Symbol | Batch 1 | Batch 2 | Batch 3 | Batch 4 | Batch 5 | Combined_Z_Score | #favorable | #unfavorable | Hit | Rank | Precision | Recall | Max(Precision) | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 100067281 | 2294 | G09 | FOXF1 | -4.020519 | -6.371358 | -0.996546 | -3.861530 | -0.269732 | -10.795805 | 2.0 | 0.0 | 1 | 1 | 100.000000 | 0.014830 | 100.000000 |
| 1 | 5274 | 3142 | G04 | HLX | -3.900398 | -0.470447 | -0.633137 | -2.842160 | -6.190604 | -9.514313 | 2.0 | 0.0 | 1 | 2 | 100.000000 | 0.029660 | 100.000000 |
| 2 | 52970 | 79755 | G02 | ZNF750 | -5.630888 | -3.813437 | 0.297178 | -2.508267 | 0.559808 | -8.123800 | 1.0 | 1.0 | 1 | 3 | 100.000000 | 0.044491 | 100.000000 |
| 3 | 5068 | 5452 | G05 | POU2F2 | -1.243005 | -4.496343 | -1.603274 | 0.259306 | -5.186843 | -8.102555 | 0.0 | 0.0 | 0 | 4 | 75.000000 | 0.044491 | 75.000000 |
| 4 | 100080502 | 342371 | delta | ATXN1L | -5.561001 | -1.584375 | -2.546113 | 0.103948 | -2.041524 | -7.127541 | 0.0 | 0.0 | 0 | 5 | 60.000000 | 0.044491 | 60.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 14502 | 100068241 | 57482 | G01 | KIAA1211 | NaN | 0.550111 | NaN | NaN | NaN | 0.550111 | 1.0 | 0.0 | 1 | 14503 | 46.473143 | 99.955509 | 46.481009 |
| 14503 | 100000034 | 23774 | G08 | BRD1 | NaN | 1.311774 | NaN | NaN | NaN | 1.311774 | 2.0 | 0.0 | 1 | 14504 | 46.476834 | 99.970340 | 46.481009 |
| 14504 | 100080188 | 131149 | delta | OTOL1 | NaN | 1.602334 | NaN | NaN | NaN | 1.602334 | 0.0 | 0.0 | 0 | 14505 | 46.473630 | 99.970340 | 46.481009 |
| 14505 | 1027 | 4753 | G01 | NELL2 | NaN | 2.654557 | NaN | NaN | NaN | 2.654557 | 1.0 | 0.0 | 1 | 14506 | 46.477320 | 99.985170 | 46.481009 |
| 14506 | 100015186 | 84435 | G08 | GPR123 | NaN | NaN | NaN | NaN | NaN | NaN | 1.0 | 0.0 | 1 | 14507 | 46.481009 | 100.000000 | 46.481009 |
14507 rows Ć 17 columns
InĀ [122]:
# Induce dataframe from TCGA
TCGA_data = pd.read_excel(f"/home/{user_id}/rnaseq_analysis/Reference/hORFeome9.1/20250718_ORFeome_PR_Curves.xlsx", sheet_name="Evidence")
# TCGA_data = pd.read_excel(f"/home/{user_id}/rnaseq_analysis/Reference/hORFeome9.1/20221003_HPA_Pathology.xlsx", sheet_name="#Evidence")
TCGA_data['Entrez'].astype(float)
zscore_merge_df = pd.merge(zscore_data_compile_df_sorted, TCGA_data[['Entrez', '#Cervical_Favor', '#Cervical_Unfavor']], left_on='NCBI', right_on='Entrez', how='left')
# # Drop the duplicate "Entrez" column
# zscore_merge_df.drop('Entrez', axis=1, inplace=True)
# Identify rows with no match based on Entrez
unmatched_rows = zscore_merge_df['Entrez'].isnull()
# Merge based on Gene for unmatched rows
zscore_merge_df.loc[unmatched_rows, 'Gene_Symbol'] = zscore_merge_df.loc[unmatched_rows, 'Gene_Symbol'].apply(lambda x: x.upper())
zscore_merge_df = pd.merge(zscore_merge_df, TCGA_data[['Entrez', '#Cervical_Favor', '#Cervical_Unfavor', 'Gene name']], left_on='Gene_Symbol', right_on='Gene name', how='left')
# Drop the duplicate "Entrez" column
zscore_merge_df.drop('Entrez_y', axis=1, inplace=True)
zscore_merge_df.rename(columns={'Entrez_x': 'Entrez'}, inplace=True)
# Replace NaN values in #favorable_x with values from #favorable_y if NaN
zscore_merge_df['#Cervical_Favor_x'] = zscore_merge_df['#Cervical_Favor_x'].fillna(zscore_merge_df['#Cervical_Favor_y'])
# Replace NaN values in #unfavorable_x with values from #unfavorable_y if NaN
zscore_merge_df['#Cervical_Unfavor_x'] = zscore_merge_df['#Cervical_Unfavor_x'].fillna(zscore_merge_df['#Cervical_Unfavor_y'])
# Drop the unwanted columns
zscore_merge_df.drop(['Entrez', 'Gene name', '#Cervical_Favor_y', '#Cervical_Unfavor_y'], axis=1, inplace=True)
# Rename the merged columns to the original column names
zscore_merge_df.rename(columns={'#Cervical_Favor_x': '#Cervical_Favor', '#Cervical_Unfavor_x': '#Cervical_Unfavor'}, inplace=True)
# Mark the unfavourable to hit
zscore_merge_df['Hit'] = zscore_merge_df.apply(lambda row: 1 if row['#Cervical_Unfavor'] >= 1 else 0, axis=1)
# zscore_merge_df = zscore_merge_df.iloc[2:, :]
# Calculate the total number of rows in the DataFrame
total_rows = len(zscore_merge_df)
# # Add a new column 'Rank' with values from 1 to total_rows
# zscore_merge_df['Rank'] = range(1, total_rows + 1)
# Find first hit index
first_hit_index = zscore_merge_df[zscore_merge_df['Hit'] == 1].index.min()
# Set Rank starting from the first Hit
if pd.isna(first_hit_index):
zscore_merge_df['Precision'] = 0
zscore_merge_df['Recall'] = 0
zscore_merge_df['Rank'] = range(1, len(zscore_merge_df) + 1)
else:
zscore_merge_df['Rank'] = None
zscore_merge_df.loc[first_hit_index:, 'Rank'] = range(1, len(zscore_merge_df) - first_hit_index + 1)
# Initialize variables
precision_values = []
recall_values = []
precision_cumulative_hit = 0
recall_cumulative_hit = 0
hit_count = zscore_merge_df['Hit'].sum()
# Compute Precision and Recall
for idx, row in zscore_merge_df.iterrows():
if idx < first_hit_index:
precision_values.append(None)
recall_values.append(None)
continue
precision_cumulative_hit += row['Hit']
recall_cumulative_hit += row['Hit']
precision = 100 if precision_cumulative_hit == 1 else (precision_cumulative_hit / row['Rank'] * 100)
recall = recall_cumulative_hit / hit_count * 100
precision_values.append(precision)
recall_values.append(recall)
zscore_merge_df['Precision'] = precision_values
zscore_merge_df['Recall'] = recall_values
# Calculate cumulative max of precision
zscore_merge_df['Max(Precision)'] = zscore_merge_df['Precision'].iloc[::-1].cummax()[::-1]
# Calculate the cumulative maximum of the 'Precision' column from each row onwards
zscore_merge_df['Max(Precision)'] = zscore_merge_df['Precision'].iloc[::-1].cummax()[::-1]
# Save final DataFrame
zscore_merge_path = os.path.join(database_files_stats, f"{directory}_HPA_Cervical_Unfavor.xlsx")
zscore_merge_df.to_excel(zscore_merge_path, index=False)
HPA_Cervical_Unfavor_df = zscore_merge_df.copy()
# Assuming you have the 'Rank', 'Precision', and 'Recall' columns in the DataFrame
rank = zscore_merge_df['Rank']
precision = zscore_merge_df['Max(Precision)']
recall = zscore_merge_df['Recall']
# Compute F1 scores
f1_scores = 2 * (precision * recall) / (precision + recall)
# Find the index where precision and recall are closest
diff = np.abs(precision - recall)
min_diff_idx = diff.idxmin()
f1_val = f1_scores[min_diff_idx]
rank_at_f1 = rank[min_diff_idx]
min_precision = precision.min()
# Print F1 score where precision ā recall
print(f"F1 score where Precision ā Recall: {f1_val:.2f}")
print(f"Corresponding Rank: {int(rank_at_f1)}")
# Plot Precision on primary y-axis
fig, ax1 = plt.subplots(figsize=(8, 8))
color_prec = '#dc143cff'
color_recall = '#003e98ff'
# Plot Precision
ax1.plot(rank, recall, label='Precision', linewidth=7.0, color=color_recall)
ax1.set_xlabel('Rank (log scale)')
ax1.set_ylabel('Precision')
ax1.tick_params(axis='y')
ax1.set_xscale('log')
ax1.set_ylim(0, 105)
ax1.set_title('HPA Cervical Unfavorable PR Curve')
ax1.grid(False)
ax1.xaxis.set_major_formatter(LogFormatter(labelOnlyBase=True))
# ax1.set_facecolor('#FAFAFA')
ax1.minorticks_off()
# Horizontal line at minimum precision
ax1.axhline(y=min_precision, color='black', linestyle='--', label=f'Min Precision = {min_precision:.2f}')
# Plot Recall on secondary y-axis
ax2 = ax1.twinx()
ax2.plot(rank, precision, label='Recall', linewidth=7.0, color=color_prec)
ax2.set_ylabel('Recall')
ax2.set_ylim(0, 105)
ax2.tick_params(axis='y')
# Define the file path for storing graph
PR_path = os.path.join(graphs_files_stats, f"{directory}_HPA_cervical_Unfavorable_PR.svg")
plt.savefig(PR_path, format='svg', bbox_inches='tight', dpi=1000)
print(f"{directory}_PR_Curve.svg to {PR_path}")
plt.show()
F1 score where Precision ā Recall: 2.61 Corresponding Rank: 396 RQ023682_PR_Curve.svg to /home/harryjo/rnaseq_analysis/RQ023682/RQ023682_graphs/Statistical_Graph/RQ023682_HPA_cervical_Unfavorable_PR.svg
InĀ [123]:
HPA_Cervical_Unfavor_df
Out[123]:
| ORF_ID | NCBI | Group | Gene_Symbol | Batch 1 | Batch 2 | Batch 3 | Batch 4 | Batch 5 | Combined_Z_Score | #Cervical_Favor | #Cervical_Unfavor | Hit | Rank | Precision | Recall | Max(Precision) | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 53066 | 7691 | G04 | ZNF132 | -1.351889 | 5.791197 | 4.831242 | 1.339717 | 1.311387 | 9.314892 | 0.0 | 0.0 | 0 | None | NaN | NaN | NaN |
| 1 | 100000022 | 84133 | G08 | ZNRF3 | 2.472490 | -1.661915 | 4.844651 | 4.483725 | 1.916883 | 8.681986 | 0.0 | 0.0 | 0 | None | NaN | NaN | NaN |
| 2 | 100010472 | 6051 | G04 | RNPEP | 3.737812 | 1.740719 | 6.325549 | -1.607528 | -0.132890 | 8.326448 | 0.0 | 0.0 | 0 | None | NaN | NaN | NaN |
| 3 | 5102 | 55214 | G05 | LEPREL1 | -0.889157 | 0.244954 | 4.972440 | 4.312032 | 1.626191 | 7.261316 | 0.0 | 0.0 | 0 | None | NaN | NaN | NaN |
| 4 | 9924 | 22795 | G06 | NID2 | 2.166479 | -0.034532 | 5.174375 | 3.790967 | -0.653718 | 7.253175 | 0.0 | 1.0 | 1 | 1 | 100.000000 | 0.326797 | 100.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 14502 | 53867 | 5290 | G06 | PIK3CA | 0.109405 | NaN | NaN | NaN | NaN | 0.109405 | 0.0 | 0.0 | 0 | 14499 | 2.110490 | 100.000000 | 2.110490 |
| 14503 | 10422 | 7786 | G02 | MAP3K12 | -0.360605 | NaN | NaN | NaN | NaN | -0.360605 | 0.0 | 0.0 | 0 | 14500 | 2.110345 | 100.000000 | 2.110345 |
| 14504 | 71971 | 3747 | G08 | KCNC2 | NaN | NaN | NaN | NaN | -1.099003 | -1.099003 | 0.0 | 0.0 | 0 | 14501 | 2.110199 | 100.000000 | 2.110199 |
| 14505 | 7993 | 6170 | G02 | RPL39 | NaN | NaN | NaN | NaN | -2.406590 | -2.406590 | 1.0 | 0.0 | 0 | 14502 | 2.110054 | 100.000000 | 2.110054 |
| 14506 | 100015186 | 84435 | G08 | GPR123 | NaN | NaN | NaN | NaN | NaN | NaN | 0.0 | 0.0 | 0 | 14503 | 2.109908 | 100.000000 | 2.109908 |
14507 rows Ć 17 columns
InĀ [124]:
# Induce dataframe from TCGA
TCGA_data = pd.read_excel(f"/home/{user_id}/rnaseq_analysis/Reference/hORFeome9.1/20250718_ORFeome_PR_Curves.xlsx", sheet_name="Evidence")
# TCGA_data = pd.read_excel(f"/home/{user_id}/rnaseq_analysis/Reference/hORFeome9.1/20221003_HPA_Pathology.xlsx", sheet_name="#Evidence")
TCGA_data['Entrez'].astype(float)
# zscore_merge_df = pd.merge(zscore_data_compile_df_sorted, TCGA_data[['Entrez', '#Cervical_Favor', '#Cervical_Unfavor']], left_on='NCBI', right_on='Entrez', how='left')
# zscore_merge_df = zscore_merge_df.sort_values(by=['Combined_Z_Score'], ascending=True)
zscore_merge_df = pd.merge(zscore_data_compile_df_sorted_negative, TCGA_data[['Entrez', '#Cervical_Favor', '#Cervical_Unfavor']], left_on='NCBI', right_on='Entrez', how='left')
# # Drop the duplicate "Entrez" column
# zscore_merge_df.drop('Entrez', axis=1, inplace=True)
# Identify rows with no match based on Entrez
unmatched_rows = zscore_merge_df['Entrez'].isnull()
# Merge based on Gene for unmatched rows
zscore_merge_df.loc[unmatched_rows, 'Gene_Symbol'] = zscore_merge_df.loc[unmatched_rows, 'Gene_Symbol'].apply(lambda x: x.upper())
zscore_merge_df = pd.merge(zscore_merge_df, TCGA_data[['Entrez', '#Cervical_Favor', '#Cervical_Unfavor', 'Gene name']], left_on='Gene_Symbol', right_on='Gene name', how='left')
# Drop the duplicate "Entrez" column
zscore_merge_df.drop('Entrez_y', axis=1, inplace=True)
zscore_merge_df.rename(columns={'Entrez_x': 'Entrez'}, inplace=True)
# Replace NaN values in #favorable_x with values from #favorable_y if NaN
zscore_merge_df['#Cervical_Favor_x'] = zscore_merge_df['#Cervical_Favor_x'].fillna(zscore_merge_df['#Cervical_Favor_y'])
# Replace NaN values in #unfavorable_x with values from #unfavorable_y if NaN
zscore_merge_df['#Cervical_Unfavor_x'] = zscore_merge_df['#Cervical_Unfavor_x'].fillna(zscore_merge_df['#Cervical_Unfavor_y'])
# Drop the unwanted columns
zscore_merge_df.drop(['Entrez', 'Gene name', '#Cervical_Favor_y', '#Cervical_Unfavor_y'], axis=1, inplace=True)
# Rename the merged columns to the original column names
zscore_merge_df.rename(columns={'#Cervical_Favor_x': '#Cervical_Favor', '#Cervical_Unfavor_x': '#Cervical_Unfavor'}, inplace=True)
# Mark the unfavourable to hit
zscore_merge_df['Hit'] = zscore_merge_df.apply(lambda row: 1 if row['#Cervical_Favor'] >= 1 else 0, axis=1)
# zscore_merge_df = zscore_merge_df.sort_values(by=['Combined_Z_Score'], ascending=True)
# zscore_merge_df = zscore_merge_df.iloc[2:, :]
# Calculate the total number of rows in the DataFrame
total_rows = len(zscore_merge_df)
# # Add a new column 'Rank' with values from 1 to total_rows
# zscore_merge_df['Rank'] = range(1, total_rows + 1)
# Find first hit index
first_hit_index = zscore_merge_df[zscore_merge_df['Hit'] == 1].index.min()
# Set Rank starting from the first Hit
if pd.isna(first_hit_index):
zscore_merge_df['Precision'] = 0
zscore_merge_df['Recall'] = 0
zscore_merge_df['Rank'] = range(1, len(zscore_merge_df) + 1)
else:
zscore_merge_df['Rank'] = None
zscore_merge_df.loc[first_hit_index:, 'Rank'] = range(1, len(zscore_merge_df) - first_hit_index + 1)
# Initialize variables
precision_values = []
recall_values = []
precision_cumulative_hit = 0
recall_cumulative_hit = 0
hit_count = zscore_merge_df['Hit'].sum()
# Compute Precision and Recall
for idx, row in zscore_merge_df.iterrows():
if idx < first_hit_index:
precision_values.append(None)
recall_values.append(None)
continue
precision_cumulative_hit += row['Hit']
recall_cumulative_hit += row['Hit']
precision = 100 if precision_cumulative_hit == 1 else (precision_cumulative_hit / row['Rank'] * 100)
recall = recall_cumulative_hit / hit_count * 100
precision_values.append(precision)
recall_values.append(recall)
zscore_merge_df['Precision'] = precision_values
zscore_merge_df['Recall'] = recall_values
# Calculate cumulative max of precision
zscore_merge_df['Max(Precision)'] = zscore_merge_df['Precision'].iloc[::-1].cummax()[::-1]
# Calculate the cumulative maximum of the 'Precision' column from each row onwards
zscore_merge_df['Max(Precision)'] = zscore_merge_df['Precision'].iloc[::-1].cummax()[::-1]
# Save final DataFrame
zscore_merge_path = os.path.join(database_files_stats, f"{directory}_HPA_Cervical_favor.xlsx")
zscore_merge_df.to_excel(zscore_merge_path, index=False)
HPA_Cervical_favor_df = zscore_merge_df.copy()
# Assuming you have the 'Rank', 'Precision', and 'Recall' columns in the DataFrame
rank = zscore_merge_df['Rank']
precision = zscore_merge_df['Max(Precision)']
recall = zscore_merge_df['Recall']
# Compute F1 scores
f1_scores = 2 * (precision * recall) / (precision + recall)
# Find the index where precision and recall are closest
diff = np.abs(precision - recall)
min_diff_idx = diff.idxmin()
f1_val = f1_scores[min_diff_idx]
rank_at_f1 = rank[min_diff_idx]
min_precision = precision.min()
# Print F1 score where precision ā recall
print(f"F1 score where Precision ā Recall: {f1_val:.2f}")
print(f"Corresponding Rank: {int(rank_at_f1)}")
# Plot Precision on primary y-axis
fig, ax1 = plt.subplots(figsize=(8, 8))
color_prec = '#dc143cff'
color_recall = '#003e98ff'
# Plot Precision
ax1.plot(rank, recall, label='Precision', linewidth=7.0, color=color_recall)
ax1.set_xlabel('Rank (log scale)')
ax1.set_ylabel('Precision')
ax1.tick_params(axis='y')
ax1.set_xscale('log')
ax1.set_ylim(0, 105)
ax1.set_title('HPA Cervical favorable PR Curve')
ax1.grid(False)
ax1.xaxis.set_major_formatter(LogFormatter(labelOnlyBase=True))
# ax1.set_facecolor('#FAFAFA')
ax1.minorticks_off()
# Horizontal line at minimum precision
ax1.axhline(y=min_precision, color='black', linestyle='--', label=f'Min Precision = {min_precision:.2f}')
# Plot Recall on secondary y-axis
ax2 = ax1.twinx()
ax2.plot(rank, precision, label='Recall', linewidth=7.0, color=color_prec)
ax2.set_ylabel('Recall')
ax2.set_ylim(0, 105)
ax2.tick_params(axis='y')
# Define the file path for storing graph
PR_path = os.path.join(graphs_files_stats, f"{directory}_HPA_Cervical_favorable_PR.svg")
plt.savefig(PR_path, format='svg', bbox_inches='tight', dpi=1000)
print(f"{directory}_PR_Curve.svg to {PR_path}")
plt.show()
F1 score where Precision ā Recall: 2.47 Corresponding Rank: 743 RQ023682_PR_Curve.svg to /home/harryjo/rnaseq_analysis/RQ023682/RQ023682_graphs/Statistical_Graph/RQ023682_HPA_Cervical_favorable_PR.svg
InĀ [125]:
HPA_Cervical_favor_df
Out[125]:
| ORF_ID | NCBI | Group | Gene_Symbol | Batch 1 | Batch 2 | Batch 3 | Batch 4 | Batch 5 | Combined_Z_Score | #Cervical_Favor | #Cervical_Unfavor | Hit | Rank | Precision | Recall | Max(Precision) | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 100067281 | 2294 | G09 | FOXF1 | -4.020519 | -6.371358 | -0.996546 | -3.861530 | -0.269732 | -10.795805 | 0.0 | 0.0 | 0 | None | NaN | NaN | NaN |
| 1 | 5274 | 3142 | G04 | HLX | -3.900398 | -0.470447 | -0.633137 | -2.842160 | -6.190604 | -9.514313 | 0.0 | 0.0 | 0 | None | NaN | NaN | NaN |
| 2 | 52970 | 79755 | G02 | ZNF750 | -5.630888 | -3.813437 | 0.297178 | -2.508267 | 0.559808 | -8.123800 | 0.0 | 0.0 | 0 | None | NaN | NaN | NaN |
| 3 | 5068 | 5452 | G05 | POU2F2 | -1.243005 | -4.496343 | -1.603274 | 0.259306 | -5.186843 | -8.102555 | 0.0 | 0.0 | 0 | None | NaN | NaN | NaN |
| 4 | 100080502 | 342371 | delta | ATXN1L | -5.561001 | -1.584375 | -2.546113 | 0.103948 | -2.041524 | -7.127541 | 0.0 | 0.0 | 0 | None | NaN | NaN | NaN |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 14502 | 100068241 | 57482 | G01 | KIAA1211 | NaN | 0.550111 | NaN | NaN | NaN | 0.550111 | 0.0 | 0.0 | 0 | 14480 | 2.182320 | 100.0 | 2.182320 |
| 14503 | 100000034 | 23774 | G08 | BRD1 | NaN | 1.311774 | NaN | NaN | NaN | 1.311774 | 0.0 | 0.0 | 0 | 14481 | 2.182170 | 100.0 | 2.182170 |
| 14504 | 100080188 | 131149 | delta | OTOL1 | NaN | 1.602334 | NaN | NaN | NaN | 1.602334 | 0.0 | 0.0 | 0 | 14482 | 2.182019 | 100.0 | 2.182019 |
| 14505 | 1027 | 4753 | G01 | NELL2 | NaN | 2.654557 | NaN | NaN | NaN | 2.654557 | 0.0 | 0.0 | 0 | 14483 | 2.181868 | 100.0 | 2.181868 |
| 14506 | 100015186 | 84435 | G08 | GPR123 | NaN | NaN | NaN | NaN | NaN | NaN | 0.0 | 0.0 | 0 | 14484 | 2.181718 | 100.0 | 2.181718 |
14507 rows Ć 17 columns
InĀ [126]:
# Induce dataframe from TCGA
TCGA_data = pd.read_excel(f"/home/{user_id}/rnaseq_analysis/Reference/hORFeome9.1/20250718_ORFeome_PR_Curves.xlsx", sheet_name="Evidence")
# TCGA_data = pd.read_excel(f"/home/{user_id}/rnaseq_analysis/Reference/hORFeome9.1/20221003_HPA_Pathology.xlsx", sheet_name="#Evidence")
TCGA_data['Entrez'].astype(float)
zscore_merge_df = pd.merge(zscore_data_compile_df_sorted, TCGA_data[['Entrez', 'Cisplatin_Resistance', '5-FU_Resistance']], left_on='NCBI', right_on='Entrez', how='left')
# # Drop the duplicate "Entrez" column
# zscore_merge_df.drop('Entrez', axis=1, inplace=True)
# Identify rows with no match based on Entrez
unmatched_rows = zscore_merge_df['Entrez'].isnull()
# Merge based on Gene for unmatched rows
zscore_merge_df.loc[unmatched_rows, 'Gene_Symbol'] = zscore_merge_df.loc[unmatched_rows, 'Gene_Symbol'].apply(lambda x: x.upper())
zscore_merge_df = pd.merge(zscore_merge_df, TCGA_data[['Entrez', 'Cisplatin_Resistance', '5-FU_Resistance', 'Gene name']], left_on='Gene_Symbol', right_on='Gene name', how='left')
# Drop the duplicate "Entrez" column
zscore_merge_df.drop('Entrez_y', axis=1, inplace=True)
zscore_merge_df.rename(columns={'Entrez_x': 'Entrez'}, inplace=True)
# Replace NaN values in #favorable_x with values from #favorable_y if NaN
zscore_merge_df['Cisplatin_Resistance_x'] = zscore_merge_df['Cisplatin_Resistance_x'].fillna(zscore_merge_df['Cisplatin_Resistance_y'])
# Replace NaN values in #unfavorable_x with values from #unfavorable_y if NaN
zscore_merge_df['5-FU_Resistance_x'] = zscore_merge_df['5-FU_Resistance_x'].fillna(zscore_merge_df['5-FU_Resistance_y'])
# Drop the unwanted columns
zscore_merge_df.drop(['Entrez', 'Gene name', 'Cisplatin_Resistance_y', '5-FU_Resistance_y'], axis=1, inplace=True)
# Rename the merged columns to the original column names
zscore_merge_df.rename(columns={'Cisplatin_Resistance_x': 'Cisplatin_Resistance', '5-FU_Resistance_x': '5-FU_Resistance'}, inplace=True)
# Mark the unfavourable to hit
zscore_merge_df['Hit'] = zscore_merge_df.apply(lambda row: 1 if row['Cisplatin_Resistance'] >= 1 else 0, axis=1)
# Find first hit index
first_hit_index = zscore_merge_df[zscore_merge_df['Hit'] == 1].index.min()
# Set Rank starting from the first Hit
if pd.isna(first_hit_index):
zscore_merge_df['Precision'] = 0
zscore_merge_df['Recall'] = 0
zscore_merge_df['Rank'] = range(1, len(zscore_merge_df) + 1)
else:
zscore_merge_df['Rank'] = None
zscore_merge_df.loc[first_hit_index:, 'Rank'] = range(1, len(zscore_merge_df) - first_hit_index + 1)
# Initialize variables
precision_values = []
recall_values = []
precision_cumulative_hit = 0
recall_cumulative_hit = 0
hit_count = zscore_merge_df['Hit'].sum()
# Compute Precision and Recall
for idx, row in zscore_merge_df.iterrows():
if idx < first_hit_index:
precision_values.append(None)
recall_values.append(None)
continue
precision_cumulative_hit += row['Hit']
recall_cumulative_hit += row['Hit']
precision = 100 if precision_cumulative_hit == 1 else (precision_cumulative_hit / row['Rank'] * 100)
recall = recall_cumulative_hit / hit_count * 100
precision_values.append(precision)
recall_values.append(recall)
zscore_merge_df['Precision'] = precision_values
zscore_merge_df['Recall'] = recall_values
# Calculate cumulative max of precision
zscore_merge_df['Max(Precision)'] = zscore_merge_df['Precision'].iloc[::-1].cummax()[::-1]
# Save output to Excel
zscore_merge_path = os.path.join(database_files_stats, f"{directory}_Cisplatin_Resistance_PR.xlsx")
zscore_merge_df.to_excel(zscore_merge_path, index=False)
Cisplatin_Resistance_df = zscore_merge_df.copy()
# Assuming you have the 'Rank', 'Precision', and 'Recall' columns in the DataFrame
rank = zscore_merge_df['Rank']
precision = zscore_merge_df['Max(Precision)']
recall = zscore_merge_df['Recall']
# Compute F1 scores
f1_scores = 2 * (precision * recall) / (precision + recall)
# Find the index where precision and recall are closest
diff = np.abs(precision - recall)
min_diff_idx = diff.idxmin()
f1_val = f1_scores[min_diff_idx]
rank_at_f1 = rank[min_diff_idx]
min_precision = precision.min()
# Print F1 score where precision ā recall
print(f"F1 score where Precision ā Recall: {f1_val:.2f}")
print(f"Corresponding Rank: {int(rank_at_f1)}")
# Plot Precision on primary y-axis
fig, ax1 = plt.subplots(figsize=(8, 8))
color_prec = '#dc143cff'
color_recall = '#003e98ff'
# Plot Precision
ax1.plot(rank, recall, label='Precision', linewidth=7.0, color=color_recall)
ax1.set_xlabel('Rank (log scale)')
ax1.set_ylabel('Precision')
ax1.tick_params(axis='y')
ax1.set_xscale('log')
ax1.set_ylim(0, 105)
ax1.set_title('Cisplatin Resistance PR Curve')
ax1.grid(False)
ax1.xaxis.set_major_formatter(LogFormatter(labelOnlyBase=True))
# ax1.set_facecolor('#FAFAFA')
ax1.minorticks_off()
# Horizontal line at minimum precision
ax1.axhline(y=min_precision, color='black', linestyle='--', label=f'Min Precision = {min_precision:.2f}')
# Plot Recall on secondary y-axis
ax2 = ax1.twinx()
ax2.plot(rank, precision, label='Recall', linewidth=7.0, color=color_prec)
ax2.set_ylabel('Recall')
ax2.set_ylim(0, 105)
ax2.tick_params(axis='y')
# Define the file path for storing graph
PR_path = os.path.join(graphs_files_stats, f"{directory}_Cisplatin_Resistance_PR.svg")
plt.savefig(PR_path, format='svg', bbox_inches='tight', dpi=1000)
print(f"{directory}_PR_Curve.svg to {PR_path}")
plt.show()
F1 score where Precision ā Recall: 10.94 Corresponding Rank: 1370 RQ023682_PR_Curve.svg to /home/harryjo/rnaseq_analysis/RQ023682/RQ023682_graphs/Statistical_Graph/RQ023682_Cisplatin_Resistance_PR.svg
InĀ [127]:
Cisplatin_Resistance_df
Out[127]:
| ORF_ID | NCBI | Group | Gene_Symbol | Batch 1 | Batch 2 | Batch 3 | Batch 4 | Batch 5 | Combined_Z_Score | Cisplatin_Resistance | 5-FU_Resistance | Hit | Rank | Precision | Recall | Max(Precision) | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 53066 | 7691 | G04 | ZNF132 | -1.351889 | 5.791197 | 4.831242 | 1.339717 | 1.311387 | 9.314892 | 0 | 0 | 0 | None | NaN | NaN | NaN |
| 1 | 100000022 | 84133 | G08 | ZNRF3 | 2.472490 | -1.661915 | 4.844651 | 4.483725 | 1.916883 | 8.681986 | 1 | 0 | 1 | 1 | 100.000000 | 0.074516 | 100.000000 |
| 2 | 100010472 | 6051 | G04 | RNPEP | 3.737812 | 1.740719 | 6.325549 | -1.607528 | -0.132890 | 8.326448 | 0 | 0 | 0 | 2 | 100.000000 | 0.074516 | 100.000000 |
| 3 | 5102 | 55214 | G05 | LEPREL1 | -0.889157 | 0.244954 | 4.972440 | 4.312032 | 1.626191 | 7.261316 | 0 | 0 | 0 | 3 | 100.000000 | 0.074516 | 100.000000 |
| 4 | 9924 | 22795 | G06 | NID2 | 2.166479 | -0.034532 | 5.174375 | 3.790967 | -0.653718 | 7.253175 | 0 | 0 | 0 | 4 | 100.000000 | 0.074516 | 100.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 14502 | 53867 | 5290 | G06 | PIK3CA | 0.109405 | NaN | NaN | NaN | NaN | 0.109405 | 0 | 0 | 0 | 14502 | 9.253896 | 100.000000 | 9.253896 |
| 14503 | 10422 | 7786 | G02 | MAP3K12 | -0.360605 | NaN | NaN | NaN | NaN | -0.360605 | 0 | 0 | 0 | 14503 | 9.253258 | 100.000000 | 9.253258 |
| 14504 | 71971 | 3747 | G08 | KCNC2 | NaN | NaN | NaN | NaN | -1.099003 | -1.099003 | 0 | 0 | 0 | 14504 | 9.252620 | 100.000000 | 9.252620 |
| 14505 | 7993 | 6170 | G02 | RPL39 | NaN | NaN | NaN | NaN | -2.406590 | -2.406590 | 0 | 0 | 0 | 14505 | 9.251982 | 100.000000 | 9.251982 |
| 14506 | 100015186 | 84435 | G08 | GPR123 | NaN | NaN | NaN | NaN | NaN | NaN | 0 | 0 | 0 | 14506 | 9.251344 | 100.000000 | 9.251344 |
14507 rows Ć 17 columns
InĀ [128]:
# Induce dataframe from TCGA
TCGA_data = pd.read_excel(f"/home/{user_id}/rnaseq_analysis/Reference/hORFeome9.1/20250718_ORFeome_PR_Curves.xlsx", sheet_name="Evidence")
# TCGA_data = pd.read_excel(f"/home/{user_id}/rnaseq_analysis/Reference/hORFeome9.1/20221003_HPA_Pathology.xlsx", sheet_name="#Evidence")
TCGA_data['Entrez'].astype(float)
zscore_merge_df = pd.merge(zscore_data_compile_df_sorted, TCGA_data[['Entrez', 'Cisplatin_Resistance', '5-FU_Resistance']], left_on='NCBI', right_on='Entrez', how='left')
# # Drop the duplicate "Entrez" column
# zscore_merge_df.drop('Entrez', axis=1, inplace=True)
# Identify rows with no match based on Entrez
unmatched_rows = zscore_merge_df['Entrez'].isnull()
# Merge based on Gene for unmatched rows
zscore_merge_df.loc[unmatched_rows, 'Gene_Symbol'] = zscore_merge_df.loc[unmatched_rows, 'Gene_Symbol'].apply(lambda x: x.upper())
zscore_merge_df = pd.merge(zscore_merge_df, TCGA_data[['Entrez', 'Cisplatin_Resistance', '5-FU_Resistance', 'Gene name']], left_on='Gene_Symbol', right_on='Gene name', how='left')
# Drop the duplicate "Entrez" column
zscore_merge_df.drop('Entrez_y', axis=1, inplace=True)
zscore_merge_df.rename(columns={'Entrez_x': 'Entrez'}, inplace=True)
# Replace NaN values in #favorable_x with values from #favorable_y if NaN
zscore_merge_df['Cisplatin_Resistance_x'] = zscore_merge_df['Cisplatin_Resistance_x'].fillna(zscore_merge_df['Cisplatin_Resistance_y'])
# Replace NaN values in #unfavorable_x with values from #unfavorable_y if NaN
zscore_merge_df['5-FU_Resistance_x'] = zscore_merge_df['5-FU_Resistance_x'].fillna(zscore_merge_df['5-FU_Resistance_y'])
# Drop the unwanted columns
zscore_merge_df.drop(['Entrez', 'Gene name', 'Cisplatin_Resistance_y', '5-FU_Resistance_y'], axis=1, inplace=True)
# Rename the merged columns to the original column names
zscore_merge_df.rename(columns={'Cisplatin_Resistance_x': 'Cisplatin_Resistance', '5-FU_Resistance_x': '5-FU_Resistance'}, inplace=True)
# Mark the unfavourable to hit
zscore_merge_df['Hit'] = zscore_merge_df.apply(lambda row: 1 if row['5-FU_Resistance'] >= 1 else 0, axis=1)
# Find first hit index
first_hit_index = zscore_merge_df[zscore_merge_df['Hit'] == 1].index.min()
# Set Rank starting from the first Hit
if pd.isna(first_hit_index):
zscore_merge_df['Precision'] = 0
zscore_merge_df['Recall'] = 0
zscore_merge_df['Rank'] = range(1, len(zscore_merge_df) + 1)
else:
zscore_merge_df['Rank'] = None
zscore_merge_df.loc[first_hit_index:, 'Rank'] = range(1, len(zscore_merge_df) - first_hit_index + 1)
# Initialize variables
precision_values = []
recall_values = []
precision_cumulative_hit = 0
recall_cumulative_hit = 0
hit_count = zscore_merge_df['Hit'].sum()
# Compute Precision and Recall
for idx, row in zscore_merge_df.iterrows():
if idx < first_hit_index:
precision_values.append(None)
recall_values.append(None)
continue
precision_cumulative_hit += row['Hit']
recall_cumulative_hit += row['Hit']
precision = 100 if precision_cumulative_hit == 1 else (precision_cumulative_hit / row['Rank'] * 100)
recall = recall_cumulative_hit / hit_count * 100
precision_values.append(precision)
recall_values.append(recall)
zscore_merge_df['Precision'] = precision_values
zscore_merge_df['Recall'] = recall_values
# Calculate cumulative max of precision
zscore_merge_df['Max(Precision)'] = zscore_merge_df['Precision'].iloc[::-1].cummax()[::-1]
# Save output to Excel
zscore_merge_path = os.path.join(database_files_stats, f"{directory}_5-FU_Resistance_PR.xlsx")
zscore_merge_df.to_excel(zscore_merge_path, index=False)
FU_Resistance_df = zscore_merge_df.copy()
# Assuming you have the 'Rank', 'Precision', and 'Recall' columns in the DataFrame
rank = zscore_merge_df['Rank']
precision = zscore_merge_df['Max(Precision)']
recall = zscore_merge_df['Recall']
# Compute F1 scores
f1_scores = 2 * (precision * recall) / (precision + recall)
# Find the index where precision and recall are closest
diff = np.abs(precision - recall)
min_diff_idx = diff.idxmin()
f1_val = f1_scores[min_diff_idx]
rank_at_f1 = rank[min_diff_idx]
min_precision = precision.min()
# Print F1 score where precision ā recall
print(f"F1 score where Precision ā Recall: {f1_val:.2f}")
print(f"Corresponding Rank: {int(rank_at_f1)}")
# Plot Precision on primary y-axis
fig, ax1 = plt.subplots(figsize=(8, 8))
color_prec = '#dc143cff'
color_recall = '#003e98ff'
# Plot Precision
ax1.plot(rank, recall, label='Precision', linewidth=7.0, color=color_recall)
ax1.set_xlabel('Rank (log scale)')
ax1.set_ylabel('Precision')
ax1.tick_params(axis='y')
ax1.set_xscale('log')
ax1.set_ylim(0, 105)
ax1.set_title('5-FU Resistance PR Curve')
ax1.grid(False)
ax1.xaxis.set_major_formatter(LogFormatter(labelOnlyBase=True))
# ax1.set_facecolor('#FAFAFA')
ax1.minorticks_off()
# Horizontal line at minimum precision
ax1.axhline(y=min_precision, color='black', linestyle='--', label=f'Min Precision = {min_precision:.2f}')
# Plot Recall on secondary y-axis
ax2 = ax1.twinx()
ax2.plot(rank, precision, label='Recall', linewidth=7.0, color=color_prec)
ax2.set_ylabel('Recall')
ax2.set_ylim(0, 105)
ax2.tick_params(axis='y')
# Define the file path for storing graph
PR_path = os.path.join(graphs_files_stats, f"{directory}_5-FU_Resistance_PR.svg")
plt.savefig(PR_path, format='svg', bbox_inches='tight', dpi=1000)
print(f"{directory}_PR_Curve.svg to {PR_path}")
plt.show()
F1 score where Precision ā Recall: 8.03 Corresponding Rank: 1061 RQ023682_PR_Curve.svg to /home/harryjo/rnaseq_analysis/RQ023682/RQ023682_graphs/Statistical_Graph/RQ023682_5-FU_Resistance_PR.svg
InĀ [129]:
FU_Resistance_df
Out[129]:
| ORF_ID | NCBI | Group | Gene_Symbol | Batch 1 | Batch 2 | Batch 3 | Batch 4 | Batch 5 | Combined_Z_Score | Cisplatin_Resistance | 5-FU_Resistance | Hit | Rank | Precision | Recall | Max(Precision) | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 53066 | 7691 | G04 | ZNF132 | -1.351889 | 5.791197 | 4.831242 | 1.339717 | 1.311387 | 9.314892 | 0 | 0 | 0 | None | NaN | NaN | NaN |
| 1 | 100000022 | 84133 | G08 | ZNRF3 | 2.472490 | -1.661915 | 4.844651 | 4.483725 | 1.916883 | 8.681986 | 1 | 0 | 0 | None | NaN | NaN | NaN |
| 2 | 100010472 | 6051 | G04 | RNPEP | 3.737812 | 1.740719 | 6.325549 | -1.607528 | -0.132890 | 8.326448 | 0 | 0 | 0 | None | NaN | NaN | NaN |
| 3 | 5102 | 55214 | G05 | LEPREL1 | -0.889157 | 0.244954 | 4.972440 | 4.312032 | 1.626191 | 7.261316 | 0 | 0 | 0 | None | NaN | NaN | NaN |
| 4 | 9924 | 22795 | G06 | NID2 | 2.166479 | -0.034532 | 5.174375 | 3.790967 | -0.653718 | 7.253175 | 0 | 0 | 0 | None | NaN | NaN | NaN |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 14502 | 53867 | 5290 | G06 | PIK3CA | 0.109405 | NaN | NaN | NaN | NaN | 0.109405 | 0 | 0 | 0 | 14496 | 7.064018 | 100.0 | 7.064018 |
| 14503 | 10422 | 7786 | G02 | MAP3K12 | -0.360605 | NaN | NaN | NaN | NaN | -0.360605 | 0 | 0 | 0 | 14497 | 7.063530 | 100.0 | 7.063530 |
| 14504 | 71971 | 3747 | G08 | KCNC2 | NaN | NaN | NaN | NaN | -1.099003 | -1.099003 | 0 | 0 | 0 | 14498 | 7.063043 | 100.0 | 7.063043 |
| 14505 | 7993 | 6170 | G02 | RPL39 | NaN | NaN | NaN | NaN | -2.406590 | -2.406590 | 0 | 0 | 0 | 14499 | 7.062556 | 100.0 | 7.062556 |
| 14506 | 100015186 | 84435 | G08 | GPR123 | NaN | NaN | NaN | NaN | NaN | NaN | 0 | 0 | 0 | 14500 | 7.062069 | 100.0 | 7.062069 |
14507 rows Ć 17 columns
InĀ [130]:
# Store your DataFrames in a dictionary
df_dict = {
# "Old HPA Unfavorable": Old_HPA_unfavorable_df.copy(),
"New HPA Unfavorable": New_HPA_unfavorable_df.copy(),
"New HPA Favorable": HPA_new_favorable_df.copy(),
"Cervical Unfavorable": HPA_Cervical_Unfavor_df.copy(),
"Cervical Favorable": HPA_Cervical_favor_df.copy(),
"Cisplatin Resistance": Cisplatin_Resistance_df.copy(),
"5-FU Resistance": FU_Resistance_df.copy(),# or Cisplatin_Resistance_df if you prefer
}
# Helper function to compute precision, recall, and max precision
def compute_pr_metrics(df):
df = df.copy()
first_hit_index = df[df['Hit'] == 1].index.min()
if pd.isna(first_hit_index):
df['Precision'] = 0
df['Recall'] = 0
df['Rank'] = range(1, len(df) + 1)
else:
df['Rank'] = None
df.loc[first_hit_index:, 'Rank'] = range(1, len(df) - first_hit_index + 1)
precision_values = []
recall_values = []
precision_cumulative_hit = 0
recall_cumulative_hit = 0
hit_count = df['Hit'].sum()
for idx, row in df.iterrows():
if idx < first_hit_index:
precision_values.append(None)
recall_values.append(None)
continue
precision_cumulative_hit += row['Hit']
recall_cumulative_hit += row['Hit']
precision = 100 if precision_cumulative_hit == 1 else (precision_cumulative_hit / row['Rank'] * 100)
recall = recall_cumulative_hit / hit_count * 100
precision_values.append(precision)
recall_values.append(recall)
df['Precision'] = precision_values
df['Recall'] = recall_values
df['Max(Precision)'] = df['Precision'].iloc[::-1].cummax()[::-1]
return df
# Plot setup
fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(24, 14))
axes = axes.flatten()
color_prec = '#dc143cff'
color_recall = '#003e98ff'
# Loop through dataframes and axes
for ax, (title, df) in zip(axes, df_dict.items()):
df = compute_pr_metrics(df)
rank = df['Rank']
precision = df['Max(Precision)']
recall = df['Recall'].apply(lambda x: float(f"{x:.2g}")) # 2 significant figures
# Compute F1 score and location
f1_scores = 2 * (precision * recall) / (precision + recall)
diff = np.abs(precision - recall)
min_diff_idx = diff.idxmin()
f1_val = f1_scores[min_diff_idx]
rank_at_f1 = rank[min_diff_idx]
min_precision = precision.min()
# Calculate hit percentage in top 10%
top_10_cutoff = int(len(df) * 0.1)
top_10_hits = df.iloc[:top_10_cutoff]['Hit'].sum()
total_hits = df['Hit'].sum()
hit_pct_top10 = (top_10_hits / top_10_cutoff * 100) if total_hits else 0
# Plot precision on primary y-axis
ax.plot(rank, precision, label='Precision', linewidth=4.0, color=color_prec)
ax.set_xlabel('Rank (log scale)')
ax.set_ylabel('Precision (%)', color=color_prec)
ax.set_xscale('log')
ax.set_ylim(0, 105)
ax.set_xlim(rank.min(), rank.max())
ax.xaxis.set_major_formatter(LogFormatter(labelOnlyBase=True))
ax.tick_params(axis='x', which='both', bottom=True, top=False)
ax.tick_params(axis='y', labelcolor=color_prec)
ax.axhline(y=min_precision, color='black', linestyle='--', linewidth=1.5)
# Remove minor ticks manually (important fix)
ax.xaxis.set_minor_locator(plt.NullLocator())
# Plot recall on secondary y-axis
ax2 = ax.twinx()
ax2.plot(rank, recall, label='Recall', linewidth=4.0, color=color_recall)
ax2.set_ylabel('Recall (%)', color=color_recall)
ax2.set_ylim(0, 105)
ax2.tick_params(axis='y', labelcolor=color_recall)
# Title with F1 and Top 10% hit info
ax.set_title(f'{title} PR Curve\nF1 ā {f1_val:.2f} @ Rank {int(rank_at_f1)}\nTop 10% Hit Rate: {hit_pct_top10:.1f}%')
# Adjust layout
plt.tight_layout()
plt.suptitle(f'Precision-Recall Curves {Chosen_combining_methods}', fontsize=40, y=1.1)
plt.subplots_adjust(top=0.9)
plt.show()
InĀ [131]:
top_10_n = int(len(HPA_Cervical_Unfavor_df ['Rank']) * 0.10)
top10_df = HPA_Cervical_Unfavor_df .iloc[:top_10_n]
observed_hits = top10_df['Hit'].sum()
rest_df = HPA_Cervical_Unfavor_df .iloc[top_10_n:]
rest_n = int(len(HPA_Cervical_Unfavor_df['Rank']) * 0.90)
InĀ [132]:
observed_hits
Out[132]:
np.int64(29)
InĀ [133]:
rest_hits = np.array(rest_df['Hit'])
random_indices = np.random.choice(len(rest_hits), size=top_10_n, replace=True)
# Sum of hits for the randomly selected subset
random_hits_sum = rest_hits[random_indices].sum()
print(random_hits_sum)
33
Kaplan-Meier Survival¶
InĀ [134]:
from lifelines import KaplanMeierFitter, statistics
InĀ [135]:
# def find_best_cutoff(df, fpkm_col='pTPM', time_col='Days', status_col='Status'):
# df = df[[fpkm_col, time_col, status_col]].dropna()
# cutoffs = np.percentile(df[fpkm_col], np.arange(20, 81)) # avoid extremes
# best_p = 1.0
# best_cut = None
# for cut in cutoffs:
# df['Group'] = df[fpkm_col] > cut
# try:
# groups = df.groupby('Group')
# T1, E1 = groups.get_group(True)[time_col], groups.get_group(True)[status_col]
# T2, E2 = groups.get_group(False)[time_col], groups.get_group(False)[status_col]
# result = statistics.logrank_test(T1, T2, E1, E2)
# if result.p_value < best_p:
# best_p = result.p_value
# best_cut = cut
# except KeyError:
# continue # if one group has no samples
# return best_cut, best_p
InĀ [136]:
from matplotlib.lines import Line2D
surival_df = pd.read_excel(f"/home/{user_id}/rnaseq_analysis/RQ023682/RQ023682_db/20250720_Survival.xlsx", sheet_name="CCDC47 - 5FU Liver")
# Clean and preprocess data
surival_df['Status'] = surival_df['Status'].astype(str).str.strip().str.lower().map({'dead': 1, 'alive': 0})
surival_df['Days'] = surival_df['Days'].str.extract(r'(\d+)').astype(float)
surival_df['Years'] = surival_df['Days'] / 365.25 # Convert to years
surival_df['pTPM'] = surival_df['pTPM'].astype(float)
# Divide into High and Low based on cutoff from HPA
Cutoff = 84.06 # From HPA webpage 20250731
logrankP = 0.00078 # From HPA webpage 20250731
surival_df['pTPM_group'] = surival_df['pTPM'].apply(lambda x: 'High' if x > Cutoff else 'Low')
# Define colors and line styles for groups
colors = {'High': '#007A03', 'Low': '#FFA90E'}
# Plot KM curves
kmf = KaplanMeierFitter()
plt.figure(figsize=(8, 8))
# Plot and collect legend handles
legend_handles = []
for group in ['High', 'Low']:
group_data = surival_df[surival_df['pTPM_group'] == group]
label = f"{group} (n={len(group_data)})"
kmf.fit(durations=group_data['Years'], event_observed=group_data['Status'], label=label)
ax = kmf.plot_survival_function(ci_show=False, color=colors[group], linewidth=7)
# Create custom legend handle
handle = Line2D([0], [0], color=colors[group], lw=7, label=label)
legend_handles.append(handle)
# Add dummy lines for cutoff and p-value (invisible but shows in legend)
legend_handles.append(Line2D([0], [0], color='none', label=f"Cutoff = {Cutoff}"))
legend_handles.append(Line2D([0], [0], color='none', label=f"P = {logrankP:.5f}"))
# Final plot adjustments
plt.title("CESC Survival by PGM1 expression", fontsize= 25)
plt.grid(False)
plt.xlabel("Time (years)", fontsize=25)
plt.xticks(fontsize=25)
plt.yticks(fontsize=25)
plt.ylabel("Survival Probability", fontsize=25)
plt.ylim(0, 1.05)
plt.xlim(0, 18)
plt.tight_layout()
# Add custom legend with all handles
plt.legend(handles=legend_handles, loc='upper right', frameon=False, fontsize = 20)
plt.show()
InĀ [137]:
def plot_kaplan_meier(df, gene_name, cutoff, logrank_p, title,
colors=None, figsize=(8, 8)):
"""
Plots a KaplanāMeier curve for High/Low expression groups based on pTPM cutoff.
Parameters:
df (DataFrame): Must contain 'Status', 'Days', and 'pTPM' columns.
gene_name (str): Gene name (for annotation only).
cutoff (float): Expression cutoff to define High/Low groups.
logrank_p (float): Pre-computed log-rank p-value.
title (str): Title of the plot.
colors (dict): Optional dict like {'High': '#FF844D', 'Low': '#4DFFED'}.
figsize (tuple): Size of the plot.
max_years (int or float): X-axis upper limit in years.
"""
if colors is None:
colors = {'High': '#007A03', 'Low': '#FFA90E'}
# Preprocess input data
df = df.copy()
df['Status'] = df['Status'].astype(str).str.strip().str.lower().map({'dead': 1, 'alive': 0})
df['Days'] = df['Days'].astype(str).str.extract(r'(\d+)').astype(float)
df['Years'] = df['Days'] / 365.25
df['pTPM'] = df['pTPM'].astype(float)
df = df.dropna(subset=['Years', 'Status', 'pTPM'])
# Grouping based on cutoff
df['pTPM_group'] = df['pTPM'].apply(lambda x: 'High' if x > cutoff else 'Low')
max_years = df['Years'].max()
# Plot setup
plt.figure(figsize=figsize)
kmf = KaplanMeierFitter()
group_handles = {}
for group in ['Low', 'High']:
group_data = df[df['pTPM_group'] == group]
label = f"{group} (n={len(group_data)})"
kmf.fit(durations=group_data['Years'], event_observed=group_data['Status'], label=label)
kmf.plot_survival_function(ci_show=False, color=colors[group], linewidth=7)
handle = Line2D([0], [0], color=colors[group], lw=7, label=label)
group_handles[group] = handle
# Add cutoff and p-value as text-only legend entries
legend_handles = [
group_handles['High'],
group_handles['Low'],
Line2D([0], [0], color='none', label=f"p = {logrank_p:.5f}")
]
fontsize = 25
# Final plot settings
plt.title(title, fontsize=fontsize)
plt.xlabel("Time (years)", fontsize=fontsize)
plt.ylabel("Survival Probability",fontsize=fontsize)
plt.ylim(0.05, 1.05)
plt.xlim(0, max_years*1.05)
plt.yticks(fontsize=fontsize)
plt.xticks(fontsize=fontsize)
plt.grid(False)
plt.tight_layout()
plt.legend(handles=legend_handles, loc='upper right', frameon=False, fontsize=20)
os.makedirs(graphs_files_stats, exist_ok=True)
filename = f"{directory}_{gene_name}.svg"
path = os.path.join(graphs_files_stats, filename)
plt.savefig(path, format='svg', dpi=1000)
print(f"Saved: {filename} to {path}")
plt.show()
InĀ [138]:
CCDC47 = pd.read_excel(f"/home/{user_id}/rnaseq_analysis/RQ023682/RQ023682_db/20250720_Survival.xlsx", sheet_name="CCDC47 - 5FU Liver")
SH3BP2 = pd.read_excel(f"/home/{user_id}/rnaseq_analysis/RQ023682/RQ023682_db/20250720_Survival.xlsx", sheet_name="SH3BP2 - Cis Pan")
SLIRP = pd.read_excel(f"/home/{user_id}/rnaseq_analysis/RQ023682/RQ023682_db/20250720_Survival.xlsx", sheet_name="SLIRP - Cer Fav")
PGM1 = pd.read_excel(f"/home/{user_id}/rnaseq_analysis/RQ023682/RQ023682_db/20250720_Survival.xlsx", sheet_name="PGM1 - Cer Unfav")
InĀ [139]:
plot_kaplan_meier(CCDC47, gene_name="CCDC47", cutoff=84.06, logrank_p=0.00078,
title="LIHC CCDC47 Expression")
plot_kaplan_meier(SH3BP2, gene_name="SH3BP2", cutoff=13.82, logrank_p=0.017,
title="PAAD SH3BP2 Expression")
plot_kaplan_meier(SLIRP, gene_name="SLIRP", cutoff=162.41, logrank_p=0.000017,
title="CESC SLIRP Expression")
plot_kaplan_meier(PGM1, gene_name="PGM1", cutoff=62.51, logrank_p=0.00033,
title="CESC PGM1 Expression")
Saved: RQ023682_CCDC47.svg to /home/harryjo/rnaseq_analysis/RQ023682/RQ023682_graphs/Statistical_Graph/RQ023682_CCDC47.svg
Saved: RQ023682_SH3BP2.svg to /home/harryjo/rnaseq_analysis/RQ023682/RQ023682_graphs/Statistical_Graph/RQ023682_SH3BP2.svg
Saved: RQ023682_SLIRP.svg to /home/harryjo/rnaseq_analysis/RQ023682/RQ023682_graphs/Statistical_Graph/RQ023682_SLIRP.svg
Saved: RQ023682_PGM1.svg to /home/harryjo/rnaseq_analysis/RQ023682/RQ023682_graphs/Statistical_Graph/RQ023682_PGM1.svg
InĀ [140]:
Zscore_values = FU_Resistance_df[['ORF_ID', 'NCBI', 'Gene_Symbol', 'Combined_Z_Score' ]].copy().sort_values(by='Combined_Z_Score', ascending=False)
# Zscore_values = FU_Resistance_df[['ORF_ID', 'NCBI', 'Gene_Symbol', 'Combined_Z_Score' ]].copy()
InĀ [141]:
Zscore_values_clean = Zscore_values.dropna(subset=['Combined_Z_Score']).copy()
Zscore_values_clean['Rank'] = np.arange(1, len(Zscore_values_clean) + 1)
plt.figure(figsize=(7, 7)) # Set the figure size for better readability
# Scatter plot: Each point represents a gene
# X-axis: Rank (derived from the sorted index)
# Y-axis: Combined_Z_Score
plt.scatter(Zscore_values_clean['Rank'], Zscore_values_clean['Combined_Z_Score'],
s=10, # Size of the points
alpha=0.6, # Transparency of the points
color='grey', # Color of the points
label='Gene Z-Score')
# Optional: Add a line connecting the points if you want to emphasize the trend
# plt.plot(Zscore_values_clean['Rank'], Zscore_values_clean['Combined_Z_Score'],
# color='red', linestyle='-', linewidth=0.5, alpha=0.5, label='Z-Score Trend')
genes_to_highlight = ['CCDC47', 'SH3BP2', 'PGM1', 'SLIRP']
highlight_data = Zscore_values_clean[
Zscore_values_clean['Gene_Symbol'].isin(genes_to_highlight)
].copy()
# --- Highlighted Genes Plot ---
# Using a larger size, different color, and a border to make them stand out
plt.scatter(highlight_data['Rank'], highlight_data['Combined_Z_Score'],
s=100, # Larger size
alpha=1, # Fully opaque
color='red', # Distinct color
edgecolor='black', # Black border
zorder=5, # Ensure these points are on top of others
label='Highlighted Genes')
# Add annotations for highlighted genes
for i, row in highlight_data.iterrows():
plt.annotate(
row['Gene_Symbol'],
(row['Rank'], row['Combined_Z_Score']),
xytext=(5, 5), # Offset text slightly from the point
textcoords='offset points',
fontsize=9,
color='darkred',
ha='left', # Horizontal alignment
va='bottom' # Vertical alignment
)
# Add labels and title
plt.xlabel('Gene Rank')
plt.ylabel('Z-Score')
plt.title('Gene Rank Plot: Combined Z-Score vs. Rank')
# Add a horizontal line at Z-score = 0 for reference
plt.axhline(y=0, color='gray', linestyle='--', linewidth=1, label='Z-Score = 0')
# Customize ticks and grid for better readability
plt.xticks(np.linspace(1, len(Zscore_values_clean), 5, dtype=int), rotation=45, ha="right") # Show 10 ticks evenly spaced
plt.grid(True, linestyle='--', alpha=0.7)
# Add a legend
# plt.legend()
# Display the plot
plt.tight_layout() # Adjust layout to prevent labels from overlapping
plt.show()
Pearson Correlation Coefficient analysis¶
InĀ [142]:
#Importing tqdm for jupyter notebook
from tqdm.notebook import tqdm
# Check if the result files already exist
PCC_testing_path = os.path.join(database_files_stats, f"{directory}_PCC_Result_Full.csv")
PCC_filter_path = os.path.join(database_files_stats, f"{directory}_PCC_Result_filter.xlsx")
if os.path.isfile(PCC_testing_path) and os.path.isfile(PCC_filter_path):
print("Result files already exist. Skipping PCC calculation.")
else:
# Continue with your existing code for PCC calculation
# Change dataframe to Fold Changes only
FC_columns = Final_pval_df[['NCBI'] + list(Final_pval_df.columns[Final_pval_df.columns.str.startswith('FC_')])]
FC_columns.set_index('NCBI', inplace=True)
# Function to calculate PCC and p-value for a pair of genes
def calculate_pcc_and_pval(gene_pair):
gene_a, gene_b = gene_pair
expression_a = FC_columns.loc[gene_a].values
expression_b = FC_columns.loc[gene_b].values
pcc, p_val = pearsonr(expression_a, expression_b)
return gene_a, gene_b, pcc, p_val
# Get all unique combinations of genes
gene_combinations = list(combinations(FC_columns.index, 2))
# Use multiprocessing for parallel computation
num_cores = cpu_count()
with Pool(num_cores) as pool:
results = list(tqdm(pool.imap(calculate_pcc_and_pval, gene_combinations), total=len(gene_combinations), desc='Processing'))
# Create a DataFrame from the results
result_values = pd.DataFrame(results, columns=['Gene A', 'Gene B', 'PCC', 'P-value'])
# Save a full result into CSV file
result_values.to_csv(PCC_testing_path, index=False)
# Get the filtered result
PCC_filter = result_values[(result_values['PCC'] > 0.9) | (result_values['PCC'] < -0.9) & (result_values['P-value'] < 0.05)]
# Save a filtered result into excel file
PCC_filter.to_excel(PCC_filter_path, index=False)
Result files already exist. Skipping PCC calculation.
Spearmen Correlation Coefficient analysis¶
InĀ [143]:
#Importing tqdm for jupyter notebook
from tqdm.notebook import tqdm
# Check if the result files already exist
SCC_testing_path = os.path.join(database_files_stats, f"{directory}_SCC_Result_Full.csv")
SCC_filter_path = os.path.join(database_files_stats, f"{directory}_SCC_Result_filter.xlsx")
if os.path.isfile(SCC_testing_path) and os.path.isfile(SCC_filter_path):
print("Result files already exist. Skipping PCC calculation.")
else:
# Continue with your existing code for PCC calculation
# Change dataframe to Fold Changes only
FC_columns = Final_pval_df[['NCBI'] + list(Final_pval_df.columns[Final_pval_df.columns.str.startswith('FC_')])]
FC_columns.set_index('NCBI', inplace=True)
# Function to calculate PCC and p-value for a pair of genes
def calculate_scc_and_pval(gene_pair):
gene_a, gene_b = gene_pair
expression_a = FC_columns.loc[gene_a].values
expression_b = FC_columns.loc[gene_b].values
pcc, p_val = spearmanr(expression_a, expression_b)
return gene_a, gene_b, pcc, p_val
# Get all unique combinations of genes
gene_combinations = list(combinations(FC_columns.index, 2))
# Use multiprocessing for parallel computation
num_cores = cpu_count()
with Pool(num_cores) as pool:
results = list(tqdm(pool.imap(calculate_scc_and_pval, gene_combinations), total=len(gene_combinations), desc='Processing'))
# Create a DataFrame from the results
result_values = pd.DataFrame(results, columns=['Gene A', 'Gene B', 'PCC', 'P-value'])
# Save a full result into CSV file
result_values.to_csv(SCC_testing_path, index=False)
# Get the filtered result
PCC_filter = result_values[(result_values['PCC'] > 0.9) | (result_values['PCC'] < -0.9) & (result_values['P-value'] < 0.05)]
# Save a filtered result into excel file
PCC_filter.to_excel(SCC_filter_path, index=False)
Result files already exist. Skipping PCC calculation.
Upset Graph¶
InĀ [144]:
# mRNA_df = pd.read_excel("/home/harryjo/rnaseq_analysis/RQ023682/RQ023682_db/Dox_vs_DMSO_LogFC_pval_v2.xlsx")
# from Bio import Entrez
# def get_entrez_id_from_gene_name(gene_name):
# if not gene_name:
# return 'N/A' # Handle empty gene names
# try:
# handle = Entrez.esearch(db="gene", term=f"{gene_name}[Gene Name] AND human[Organism]", retmax="1")
# record = Entrez.read(handle)
# handle.close()
# if record["IdList"]:
# # Entrez IDs are returned as strings in the IdList
# return record["IdList"][0]
# else:
# return "N/A" # Gene not found
# except Exception as e:
# print(f"Error fetching Entrez ID for {gene_name}: {e}")
# return "Error" # Indicate an error occurred
# entrez_ids = []
# for index, row in mRNA_df.iterrows():
# gene_name = row['gene_id']
# entrez_id = get_entrez_id_from_gene_name(gene_name)
# entrez_ids.append(entrez_id)
# mRNA_df['entrez_acc_number'] = entrez_ids
# mRNA_df.to_excel("/home/harryjo/rnaseq_analysis/RQ023682/RQ023682_db/Dox_vs_DMSO_LogFC_pval_v2_with_entrez.xlsx", index=False)
InĀ [145]:
mRNA_df = pd.read_excel("/home/harryjo/rnaseq_analysis/RQ023682/RQ023682_db/Dox_vs_DMSO_LogFC_pval_v2_with_entrez.xlsx")
InĀ [146]:
Dox_main = Final_pval[["NCBI", "FC_Doxorubicin", "ovp3_Doxorubicin"]]
Dox_mRNA = mRNA_df[["Entrez_ID", "Full_log2FC", "Full_ppval", "24hr_log2FC", "24hr_ppval"]]
InĀ [147]:
Dox_main_all = Final_pval["NCBI"].dropna().astype(int).unique().tolist()
Dox_mRNA_all = mRNA_df["Entrez_ID"].dropna().astype(int).unique().tolist()
InĀ [148]:
# DEG list 1 from Final_pval based on FC_Doxorubicin and ovp3_Doxorubicin
Main_DEGs = Final_pval[
((Final_pval["FC_Doxorubicin"] > 0.5) | (Final_pval["FC_Doxorubicin"] < -0.5)) &
(Final_pval["ovp3_Doxorubicin"] < 0.05)
]["NCBI"].dropna().astype(int).unique().tolist()
# DEG list 2 from mRNA_df based on Full_log2FC and Full_ppval
Full_DEGs = mRNA_df[
((mRNA_df["Full_log2FC"] > 0.5) | (mRNA_df["Full_log2FC"] < -0.5)) &
(mRNA_df["Full_ppval"] < 0.05)
]["Entrez_ID"].dropna().astype(int).unique().tolist()
# DEG list 3 from mRNA_df based on 24hr_log2FC and 24hr_ppval
short_DEGs = mRNA_df[
((mRNA_df["24hr_log2FC"] > 0.5) | (mRNA_df["24hr_log2FC"] < -0.5)) &
(mRNA_df["24hr_ppval"] < 0.05)
]["Entrez_ID"].dropna().astype(int).unique().tolist()
InĀ [149]:
from upsetplot import UpSet, from_memberships
import matplotlib.pyplot as plt
InĀ [150]:
# import warnings
# warnings.filterwarnings("ignore", category=FutureWarning)
# warnings.filterwarnings("ignore", category=UserWarning)
# # Optional: Set global font style
# mpl.rcParams.update({
# 'font.size': 12, # Base font size
# 'axes.titlesize': 16, # Title font
# 'axes.labelsize': 14, # Axis labels
# 'xtick.labelsize': 12, # Tick labels
# 'ytick.labelsize': 12,
# 'legend.fontsize': 12,
# })
# # Reuse your data preparation from before
# deg_sets = {
# "ORFeome Screening": set(Dox_main_all),
# "mRNA seq": set(Dox_mRNA_all),
# "ORFeome DEGs": set(Main_DEGs),
# "120 hr screening": set(Full_DEGs),
# "24 hr screening": set(short_DEGs),
# }
# all_genes = set.union(*deg_sets.values())
# memberships = [[name for name, s in deg_sets.items() if gene in s] for gene in all_genes]
# upset_data = from_memberships(memberships)
# # Plot with adjusted layout and title
# plt.figure(figsize=(14, 8))
# up = UpSet(upset_data, subset_size='count', show_counts=True)
# up.plot()
# # Optional: Add a bigger title
# plt.suptitle("UpSet Plot of DEG List Overlaps", fontsize=18)
# # Optional: Adjust layout manually instead of tight_layout
# plt.subplots_adjust(hspace=0.4, top=0.85)
# plt.show()
InĀ [151]:
from upsetplot import UpSet, from_memberships
import matplotlib.pyplot as plt
# DEG group for UpSet plot
deg_sets = {
"Main_DEGs": set(Main_DEGs),
"Full_DEGs": set(Full_DEGs),
"short_DEGs": set(short_DEGs),
}
# Build membership from union of genes
all_genes = set.union(*deg_sets.values())
memberships = [[name for name, s in deg_sets.items() if gene in s] for gene in all_genes]
upset_data = from_memberships(memberships)
# Plot UpSet
plt.figure(figsize=(10, 6))
up = UpSet(upset_data, show_counts=True, subset_size='count')
up.plot()
plt.suptitle("Overlap Among DEGs (Main, Full, 24hr)", fontsize=16)
plt.subplots_adjust(top=0.85)
plt.show()
/home/harryjo/anaconda3/envs/pipeline/lib/python3.11/site-packages/upsetplot/data.py:303: FutureWarning:
Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version. Call result.infer_objects(copy=False) instead. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`
/home/harryjo/anaconda3/envs/pipeline/lib/python3.11/site-packages/upsetplot/plotting.py:795: FutureWarning:
A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.
For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.
/home/harryjo/anaconda3/envs/pipeline/lib/python3.11/site-packages/upsetplot/plotting.py:796: FutureWarning:
A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.
For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.
/home/harryjo/anaconda3/envs/pipeline/lib/python3.11/site-packages/upsetplot/plotting.py:797: FutureWarning:
A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.
For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.
/home/harryjo/anaconda3/envs/pipeline/lib/python3.11/site-packages/upsetplot/plotting.py:798: FutureWarning:
A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.
For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.
<Figure size 1000x600 with 0 Axes>
/home/harryjo/anaconda3/envs/pipeline/lib/python3.11/site-packages/matplotlib/text.py:762: DeprecationWarning: Conversion of an array with ndim > 0 to a scalar is deprecated, and will error in future. Ensure you extract a single element from your array before performing this operation. (Deprecated NumPy 1.25.) /home/harryjo/anaconda3/envs/pipeline/lib/python3.11/site-packages/matplotlib/text.py:763: DeprecationWarning: Conversion of an array with ndim > 0 to a scalar is deprecated, and will error in future. Ensure you extract a single element from your array before performing this operation. (Deprecated NumPy 1.25.) /home/harryjo/anaconda3/envs/pipeline/lib/python3.11/site-packages/matplotlib/text.py:905: DeprecationWarning: Conversion of an array with ndim > 0 to a scalar is deprecated, and will error in future. Ensure you extract a single element from your array before performing this operation. (Deprecated NumPy 1.25.) /home/harryjo/anaconda3/envs/pipeline/lib/python3.11/site-packages/matplotlib/text.py:906: DeprecationWarning: Conversion of an array with ndim > 0 to a scalar is deprecated, and will error in future. Ensure you extract a single element from your array before performing this operation. (Deprecated NumPy 1.25.)
InĀ [152]:
deg_sets = {
"ORFeomes": set(Main_DEGs),
"120 hr": set(Full_DEGs),
"24 hr": set(short_DEGs),
}
Dox_main_all_f = [0] * len(set(Dox_main_all))
Dox_mRNA_all_f = [99999999] * len(set(Dox_mRNA_all))
# Step 3: Combine into membership list
memberships = []
# Add fake memberships (just for bar chart)
memberships += [["Total ORFeomes"] for _ in Dox_main_all_f]
memberships += [["mRNA transcriptomics"] for _ in Dox_mRNA_all_f]
# Add real DEG set memberships
for gene in set.union(*deg_sets.values()):
gene_membership = [name for name, geneset in deg_sets.items() if gene in geneset]
memberships.append(gene_membership)
# Step 4: Build UpSet data and plot
from upsetplot import from_memberships
data = from_memberships(memberships)
upset = UpSet(data, show_counts=True, subset_size='count', sort_by='degree', orientation="vertical", min_degree=2)
plt.rcParams["font.size"] = 6
upset.plot()
# Grab the current figure and axes
fig = plt.gcf()
axes = fig.get_axes()
# Usually, the set labels are on the first or second axis ā let's check and rotate all x-tick labels
for ax in axes:
# Rotate x tick labels if they exist
if ax.get_xticklabels():
for label in ax.get_xticklabels():
label.set_rotation(270)
# label.set_ha('right')
# plt.tight_layout()
# plt.suptitle("Doxorubicin Screening", fontsize=16)
# Define the file path for storing graph
Upset_path = os.path.join(graphs_files_stats, f"{directory}_upset.svg")
plt.savefig(Upset_path , format='svg', bbox_inches='tight', dpi=1000)
plt.show()
/home/harryjo/anaconda3/envs/pipeline/lib/python3.11/site-packages/upsetplot/data.py:303: FutureWarning:
Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version. Call result.infer_objects(copy=False) instead. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`
/home/harryjo/anaconda3/envs/pipeline/lib/python3.11/site-packages/upsetplot/plotting.py:795: FutureWarning:
A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.
For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.
/home/harryjo/anaconda3/envs/pipeline/lib/python3.11/site-packages/upsetplot/plotting.py:796: FutureWarning:
A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.
For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.
/home/harryjo/anaconda3/envs/pipeline/lib/python3.11/site-packages/upsetplot/plotting.py:797: FutureWarning:
A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.
For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.
/home/harryjo/anaconda3/envs/pipeline/lib/python3.11/site-packages/upsetplot/plotting.py:798: FutureWarning:
A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.
For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.
/home/harryjo/anaconda3/envs/pipeline/lib/python3.11/site-packages/matplotlib/text.py:763: DeprecationWarning:
Conversion of an array with ndim > 0 to a scalar is deprecated, and will error in future. Ensure you extract a single element from your array before performing this operation. (Deprecated NumPy 1.25.)
/home/harryjo/anaconda3/envs/pipeline/lib/python3.11/site-packages/matplotlib/text.py:762: DeprecationWarning:
Conversion of an array with ndim > 0 to a scalar is deprecated, and will error in future. Ensure you extract a single element from your array before performing this operation. (Deprecated NumPy 1.25.)
/home/harryjo/anaconda3/envs/pipeline/lib/python3.11/site-packages/matplotlib/text.py:906: DeprecationWarning:
Conversion of an array with ndim > 0 to a scalar is deprecated, and will error in future. Ensure you extract a single element from your array before performing this operation. (Deprecated NumPy 1.25.)
/home/harryjo/anaconda3/envs/pipeline/lib/python3.11/site-packages/matplotlib/text.py:905: DeprecationWarning:
Conversion of an array with ndim > 0 to a scalar is deprecated, and will error in future. Ensure you extract a single element from your array before performing this operation. (Deprecated NumPy 1.25.)
InĀ [153]:
# Select the columns containing the sample data
batch_compile_df_corr = batch_compile_df.fillna(0)
columns_to_include = [str(column) for column in nor_clean_compile_df.columns if any(any(sample in column for sample in samples) for samples in sample_key.values())]
excludes = ("63-mCherryPositive&BFPNegative", "64-mCherryNegative&BFPNegative", "Serumfree")
columns_to_include = [col for col in columns_to_include if not any(exclude in col for exclude in excludes)]
InĀ [154]:
import re
# Stats variable reconfirmed (Overlap between R and Scipy)
stats = importr("stats")
# Select the columns containing the sample data
batch_compile_df_corr = batch_compile_df.fillna(0)
columns_to_include = [str(column) for column in nor_clean_compile_df.columns if any(any(sample in column for sample in samples) for samples in sample_key.values())]
excludes = ("63-mCherryPositive&BFPNegative", "64-mCherryNegative&BFPNegative", "Serumfree")
columns_to_include = [col for col in columns_to_include if not any(exclude in col for exclude in excludes)]
batch_compile_df_corr = batch_compile_df_corr[columns_to_include].astype(float)
# Exclude some columns
excludes = {"63-mCherryPositive&BFPNegative", "64-mCherryNegative&BFPNegative", "Serumfree"}
# Add columns containing specific patterns like 'DMSO' or 'baseline'
pattern_excludes = {'DMSO', 'baseline'}
for col in batch_compile_df_corr.columns:
if any(pattern.lower() in col.lower() for pattern in pattern_excludes):
excludes.add(col)
# Drop all the identified columns
batch_compile_df_corr_f = batch_compile_df_corr.drop(columns=list(excludes), errors='ignore')
# Use numeric columns from batch_compile_df_corr_f
numeric_df = batch_compile_df_corr_f.select_dtypes(include='number')
corr_matrix = numeric_df.corr()
# Extract drug names from column names
def extract_drug_name(col):
match = re.match(r'\d+-(.*)-[A-Z]$', col)
return match.group(1) if match else col
drug_names = [extract_drug_name(col) for col in corr_matrix.columns]
# Compute positions for one label per drug
tick_positions = []
tick_labels = []
seen = set()
for i, name in enumerate(drug_names):
if name not in seen:
tick_positions.append(i)
tick_labels.append(name)
seen.add(name)
# Plot the heatmap
fig, ax = plt.subplots(figsize=(16, 16))
heatmap = sns.heatmap(corr_matrix,
cmap='Purples',
annot=False,
linewidths=0,
cbar=False,
ax=ax)
colorbar = fig.colorbar(heatmap.get_children()[0],
ax=ax,
orientation='horizontal',
shrink=0.5,
pad=0.1,
location ='top') # pad controls spacing from the plot
# Add colorbar label
colorbar.set_label("Pearson Correlation Coefficient")
# Set simplified ticks
plt.xticks(tick_positions, tick_labels, rotation=45, ha='center')
plt.yticks(tick_positions, tick_labels, rotation=0)
Figure_PCC_Heatmap_path = os.path.join(graphs_files_original, f"{directory}_Figure_Heatmap.svg")
plt.savefig(Figure_PCC_Heatmap_path, format='svg',bbox_inches='tight', dpi=1000)
print(f"{directory}_Post_PCC_Heatmap.svg saved to {Figure_PCC_Heatmap_path }")
plt.tight_layout()
plt.show()
RQ023682_Post_PCC_Heatmap.svg saved to /home/harryjo/rnaseq_analysis/RQ023682/RQ023682_graphs/Original_Graph/RQ023682_Figure_Heatmap.svg
InĀ [155]:
plt.rcParams['axes.labelsize'] = 30 # For X and Y labels
plt.rcParams['xtick.labelsize'] = 30 # For X-axis tick labels
plt.rcParams['ytick.labelsize'] = 30 # For Y-axis tick labels
plt.rcParams['legend.fontsize'] = 30 # For the legend
plt.rcParams['axes.titlesize'] = 30 # For the plot title
plt.rcParams['axes.linewidth'] = 1.5 # For the axes lines
plt.rcParams['font.size'] = 30 # For all text in the plot
InĀ [156]:
Autophagy_df = pd.read_excel("/home/harryjo/rnaseq_analysis/RQ023682/RQ023682_db/RQ025080_Final_meanFC_pval.xlsx", sheet_name='Sheet1')
Autophagy_df_all = Autophagy_df["NCBI"].dropna().astype(int).unique().tolist()
Final_all = pd.read_excel("/home/harryjo/rnaseq_analysis/RQ023682/RQ023682_db/20240522_Screening.xlsx", sheet_name='A')
InĀ [157]:
TAS_main = Final_all[['NCBI', 'Verified', 'Silencing', "FC_TAS102", "pval_TAS102"]]
InĀ [158]:
TAS_DEGs = TAS_main[
((TAS_main["FC_TAS102"] >= 0.5) | (TAS_main["FC_TAS102"] <= -0.5)) &
(TAS_main["pval_TAS102"] < 0.05)
& (TAS_main["Silencing"] == 'No Silence')
& (TAS_main["Verified"] == 1)
]["NCBI"].dropna().astype(int).unique().tolist()
InĀ [159]:
# DEG list 2 from mRNA_df based on Full_log2FC and Full_ppval
Upper_DEG = Autophagy_df[
((Autophagy_df["FC_Upper"] >=0.5) | (Autophagy_df["FC_Upper"] <= -0.5)) &
(Autophagy_df["pval_Upper"] < 0.05) &
(Autophagy_df["Verified"] == 1)
]["NCBI"].dropna().astype(int).unique().tolist()
# DEG list 3 from mRNA_df based on 24hr_log2FC and 24hr_ppval
Lower_DEG = Autophagy_df[
((Autophagy_df["FC_Lower"] >= 0.5) | (Autophagy_df["FC_Lower"] <= -0.5)) &
(Autophagy_df["pval_Lower"] < 0.05) &
(Autophagy_df["Verified"] == 1)
]["NCBI"].dropna().astype(int).unique().tolist()
InĀ [160]:
# DEG list 2 from mRNA_df based on Full_log2FC and Full_ppval
Upper_DEG_2 = Autophagy_df[
((Autophagy_df["FC_Com_Exp_Up"] >= 0.5) | (Autophagy_df["FC_Com_Exp_Up"] <= -0.5)) &
(Autophagy_df["pval_Com_Exp_Up"] < 0.05)
& (Autophagy_df["Verified"] == 1)
]["NCBI"].dropna().astype(int).unique().tolist()
# DEG list 3 from mRNA_df based on 24hr_log2FC and 24hr_ppval
Lower_DEG_2 = Autophagy_df[
((Autophagy_df["FC_Com_Exp_Lo"] >= 0.5) | (Autophagy_df["FC_Com_Exp_Lo"] <= -0.5)) &
(Autophagy_df["pval_Com_Exp_Lo"] < 0.05)
& (Autophagy_df["Verified"] == 1)
]["NCBI"].dropna().astype(int).unique().tolist()
InĀ [161]:
# All
Upper_DEG_3 = Upper_DEG + Upper_DEG_2
Lower_DEG_3 = Lower_DEG + Lower_DEG_2
auto_degs_3 = {
"Main DEGs": set(TAS_DEGs),
"Upper DEGs": set(Upper_DEG_3),
"Lower DEGs": set(Lower_DEG_3)
}
auto_all_3 = set.union(*auto_degs_3.values())
auto_memberships_3 = [[name for name, s in auto_degs_3.items() if gene in s] for gene in auto_all_3]
auto_upset_3 = from_memberships(auto_memberships_3)
# Plot UpSet
up = UpSet(auto_upset_3, show_counts=True, subset_size='count', sort_by='degree', orientation="vertical", min_degree=2)
up.plot()
Autophagy_path = os.path.join(graphs_files_stats, f"{directory}_Autophagy_Upset.svg")
plt.savefig(Autophagy_path, format='svg', bbox_inches='tight', dpi=1000)
plt.show()
/home/harryjo/anaconda3/envs/pipeline/lib/python3.11/site-packages/upsetplot/data.py:303: FutureWarning:
Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version. Call result.infer_objects(copy=False) instead. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`
/home/harryjo/anaconda3/envs/pipeline/lib/python3.11/site-packages/upsetplot/plotting.py:795: FutureWarning:
A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.
For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.
/home/harryjo/anaconda3/envs/pipeline/lib/python3.11/site-packages/upsetplot/plotting.py:796: FutureWarning:
A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.
For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.
/home/harryjo/anaconda3/envs/pipeline/lib/python3.11/site-packages/upsetplot/plotting.py:797: FutureWarning:
A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.
For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.
/home/harryjo/anaconda3/envs/pipeline/lib/python3.11/site-packages/upsetplot/plotting.py:798: FutureWarning:
A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.
For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.
/home/harryjo/anaconda3/envs/pipeline/lib/python3.11/site-packages/matplotlib/text.py:763: DeprecationWarning:
Conversion of an array with ndim > 0 to a scalar is deprecated, and will error in future. Ensure you extract a single element from your array before performing this operation. (Deprecated NumPy 1.25.)
/home/harryjo/anaconda3/envs/pipeline/lib/python3.11/site-packages/matplotlib/text.py:762: DeprecationWarning:
Conversion of an array with ndim > 0 to a scalar is deprecated, and will error in future. Ensure you extract a single element from your array before performing this operation. (Deprecated NumPy 1.25.)
/home/harryjo/anaconda3/envs/pipeline/lib/python3.11/site-packages/matplotlib/text.py:906: DeprecationWarning:
Conversion of an array with ndim > 0 to a scalar is deprecated, and will error in future. Ensure you extract a single element from your array before performing this operation. (Deprecated NumPy 1.25.)
/home/harryjo/anaconda3/envs/pipeline/lib/python3.11/site-packages/matplotlib/text.py:905: DeprecationWarning:
Conversion of an array with ndim > 0 to a scalar is deprecated, and will error in future. Ensure you extract a single element from your array before performing this operation. (Deprecated NumPy 1.25.)
InĀ [162]:
HSR = pd.read_excel("/home/harryjo/rnaseq_analysis/RQ023682/RQ023682_db/20240522_HSR.xlsx", sheet_name='B')
HSR_original = pd.read_excel("/home/harryjo/rnaseq_analysis/RQ023682/RQ023682_db/20240522_HSR.xlsx", sheet_name='HSR')
Dox_only = pd.read_excel("/home/harryjo/rnaseq_analysis/RQ023682/RQ023682_db/20240522_HSR.xlsx", sheet_name='HSR_only')
InĀ [163]:
# 96 genes from the HSR Doxorubicin bin
InĀ [164]:
# Dox_main_all
# Main_DEGs
InĀ [165]:
HSR_all = HSR["NCBI"].dropna().astype(int).unique().tolist()
Dox_only_all = Dox_only["NCBI"].dropna().astype(int).unique().tolist()
HSR_original_all = HSR_original["NCBI"].dropna().astype(int).unique().tolist()
HSR_DEGs = HSR[
((HSR["FC_Doxorubicin"] >= 0.5) | (HSR["FC_Doxorubicin"] <= -0.5)) &
(HSR["pval_Doxorubicin"] < 0.05)
]["NCBI"].dropna().astype(int).unique().tolist()
InĀ [166]:
Total_HSR = set(HSR_all)
Total_original_HSR = set(HSR_original_all)
intersection = Total_original_HSR.intersection(Total_HSR)
only_Total_main = Total_original_HSR - intersection
only_Total_HSR = Total_HSR - Total_HSR
fig, ax = plt.subplots(figsize=(16, 16))
venn = venn2(subsets=(len(only_Total_main), len(only_Total_HSR), len(intersection)),
set_labels=('Doxorubicin Screening', 'HSR Screening'),
ax=ax)
HSR_all_graph_path = os.path.join(graphs_files_stats, f"{directory}_HSR_all_Venn.svg")
plt.savefig(HSR_all_graph_path, format='svg', bbox_inches='tight', dpi=1000)
print(f"Merged read_summary.svg saved to {Venn5_graph_path}")
plt.tight_layout()
Merged read_summary.svg saved to /home/harryjo/rnaseq_analysis/RQ023682/RQ023682_graphs/Statistical_Graph/RQ023682_Stats_Venn5_Graph.svg
InĀ [167]:
Dox_sets = set(Dox_only_all)
insersection_HSR = Total_HSR.intersection(Dox_sets)
only_Dox_all = Dox_sets - insersection_HSR
only_Dox = Dox_sets - Dox_sets
fig, ax = plt.subplots(figsize=(16, 16))
venn = venn2(subsets=(len(only_Dox_all), len(only_Dox), len(insersection_HSR)),
set_labels=('Doxorubicin Screening', 'HSR Screening'),
ax=ax)
HSR_Dox_graph_path = os.path.join(graphs_files_stats, f"{directory}_HSR_Dox_Venn.svg")
plt.savefig(HSR_Dox_graph_path, format='svg', bbox_inches='tight', dpi=1000)
print(f"Merged read_summary.svg saved to {Venn5_graph_path}")
plt.tight_layout()
Merged read_summary.svg saved to /home/harryjo/rnaseq_analysis/RQ023682/RQ023682_graphs/Statistical_Graph/RQ023682_Stats_Venn5_Graph.svg
InĀ [168]:
from gprofiler import GProfiler
# --- Step 1: Filter peripheral genes per drug (no exclusivity) ---
peripheral_GOBP = {}
for drug_name in drug_name_list:
nan_col = f'nan_filter_{drug_name}'
fc_col = f'FC_{drug_name}'
pval_col = f'ovp3_{drug_name}'
df_sub = Final_pval[
(Final_pval[nan_col] == 1) &
((Final_pval[fc_col] >= 0.5)
|
(Final_pval[fc_col] <= -0.5)) &
(Final_pval[pval_col] <= 0.05)
]
genes_this_drug = set(df_sub['NCBI'])
# Remove core genes if Core_Genes defined
if 'Core_Genes' in globals():
genes_this_drug.difference_update(set(Core_Genes))
peripheral_GOBP[drug_name] = list(map(str, genes_this_drug))
print("Peripheral gene counts per drug:")
for drug, genes in peripheral_GOBP.items():
print(f"{drug}: {len(genes)} genes")
# --- Step 2: Run GOBP enrichment for each drug ---
gp = GProfiler(return_dataframe=True)
gobp_results = {}
for drug, gene_list in peripheral_GOBP.items():
if not gene_list:
print(f"No genes for {drug}, skipping GOBP")
continue
print(f"Running GO:BP enrichment for {drug} ({len(gene_list)} genes)...")
res = gp.profile(
organism='hsapiens',
query=gene_list,
sources=['GO:BP'],
user_threshold=0.05
)
gobp_results[drug] = res
print(f" Found {len(res)} enriched GO:BP terms for {drug}")
# --- Step 3: Combine GO results into one DataFrame ---
all_gobp = []
for drug, df in gobp_results.items():
if df.empty:
continue
df = df.copy()
df['drug'] = drug
all_gobp.append(df)
combined_gobp = pd.concat(all_gobp, ignore_index=True)
combined_gobp = combined_gobp[['name', 'p_value', 'intersection_size', 'drug']]
combined_gobp.rename(columns={
'name': 'GO_term',
'p_value': 'p_adj',
'intersection_size': 'Count'
}, inplace=True)
Peripheral gene counts per drug: Paclitaxel: 1612 genes Cisplatin: 1785 genes TFT: 1625 genes FdU: 1844 genes EdU: 826 genes Doxorubicin: 1983 genes 5FU: 1789 genes Carboplatin: 1828 genes Bleomycin: 1775 genes Etoposide: 1516 genes MitomycinC: 1461 genes Carmustine: 1641 genes Irinotecan: 1841 genes 6mercaptopurine: 755 genes Vinblastine: 1329 genes TAS102: 2046 genes Running GO:BP enrichment for Paclitaxel (1612 genes)... Found 36 enriched GO:BP terms for Paclitaxel Running GO:BP enrichment for Cisplatin (1785 genes)... Found 90 enriched GO:BP terms for Cisplatin Running GO:BP enrichment for TFT (1625 genes)... Found 59 enriched GO:BP terms for TFT Running GO:BP enrichment for FdU (1844 genes)... Found 26 enriched GO:BP terms for FdU Running GO:BP enrichment for EdU (826 genes)... Found 1 enriched GO:BP terms for EdU Running GO:BP enrichment for Doxorubicin (1983 genes)...
Found 69 enriched GO:BP terms for Doxorubicin Running GO:BP enrichment for 5FU (1789 genes)... Found 63 enriched GO:BP terms for 5FU Running GO:BP enrichment for Carboplatin (1828 genes)... Found 62 enriched GO:BP terms for Carboplatin Running GO:BP enrichment for Bleomycin (1775 genes)... Found 72 enriched GO:BP terms for Bleomycin Running GO:BP enrichment for Etoposide (1516 genes)... Found 40 enriched GO:BP terms for Etoposide Running GO:BP enrichment for MitomycinC (1461 genes)... Found 53 enriched GO:BP terms for MitomycinC Running GO:BP enrichment for Carmustine (1641 genes)... Found 50 enriched GO:BP terms for Carmustine Running GO:BP enrichment for Irinotecan (1841 genes)... Found 46 enriched GO:BP terms for Irinotecan Running GO:BP enrichment for 6mercaptopurine (755 genes)... Found 15 enriched GO:BP terms for 6mercaptopurine Running GO:BP enrichment for Vinblastine (1329 genes)... Found 17 enriched GO:BP terms for Vinblastine Running GO:BP enrichment for TAS102 (2046 genes)... Found 206 enriched GO:BP terms for TAS102
InĀ [169]:
# Build mapping from Entrez ID (NCBI) to Gene Symbol
id2symbol = (
Final_pval[['NCBI', 'Gene_Symbol']]
.dropna()
.drop_duplicates('NCBI')
.set_index('NCBI')['Gene_Symbol']
.to_dict()
)
peripheral_GSEApy = {}
core = set(Core_Genes) if 'Core_Genes' in globals() else set()
for drug_name in drug_name_list:
ids = peripheral_dfs_select.get(drug_name, [])
# Convert IDs to symbols, skipping unmapped ones
symbols = [id2symbol.get(int(i)) for i in ids if int(i) in id2symbol]
# Remove None values and strip whitespace
symbols = [s.strip() for s in symbols if isinstance(s, str) and s.strip()]
# Remove core genes if needed
symbols = sorted(set(symbols) - core)
peripheral_GSEApy[drug_name] = symbols
# Optional sanity check
for d, genes in peripheral_GSEApy.items():
print(f"{d}: {len(genes)} mapped gene symbols")
Paclitaxel: 214 mapped gene symbols Cisplatin: 257 mapped gene symbols TFT: 192 mapped gene symbols FdU: 258 mapped gene symbols EdU: 174 mapped gene symbols Doxorubicin: 270 mapped gene symbols 5FU: 198 mapped gene symbols Carboplatin: 301 mapped gene symbols Bleomycin: 236 mapped gene symbols Etoposide: 270 mapped gene symbols MitomycinC: 205 mapped gene symbols Carmustine: 215 mapped gene symbols Irinotecan: 324 mapped gene symbols 6mercaptopurine: 167 mapped gene symbols Vinblastine: 218 mapped gene symbols TAS102: 333 mapped gene symbols
InĀ [879]:
# Build categorical_GSEApy from category_dfs_select
categorical_GSEApy = {}
for category_name in category_dfs_select.keys():
ids = category_dfs_select.get(category_name, [])
# Convert IDs to symbols, skipping unmapped ones
symbols = [id2symbol.get(int(i)) for i in ids if int(i) in id2symbol]
# Remove None values and strip whitespace
symbols = [s.strip() for s in symbols if isinstance(s, str) and s.strip()]
# Remove core genes if needed
symbols = sorted(set(symbols) - core)
categorical_GSEApy[category_name] = symbols
# Optional sanity check
for c, genes in categorical_GSEApy.items():
print(f"{c}: {len(genes)} mapped gene symbols")
Antimetabolite: 231 mapped gene symbols DNA cross linking agent: 247 mapped gene symbols DNA strand break agent: 342 mapped gene symbols Microtubule inhibitor: 252 mapped gene symbols
InĀ [450]:
silenced_genes = set(Final_pval_df.loc[Final_pval_df["Silencing"] == "Silenced", "NCBI"])
InĀ [Ā ]:
import gseapy as gp
# Input data should be Gene Symbols
peripheral_GSEApy = {}
for drug_name in drug_name_list:
nan_col = f'nan_filter_{drug_name}'
fc_col = f'FC_{drug_name}'
pval_col = f'ovp3_{drug_name}'
df_sub = Final_pval[
(Final_pval[nan_col] == 1) &
((Final_pval[fc_col] >= 0.5)
|
(Final_pval[fc_col] <= -0.5)
)
&
(Final_pval[pval_col] <= 0.05)
]
genes_this_drug = set(df_sub['Gene_Symbol']) # make sure you use gene symbols
if 'Core_Genes' in globals():
genes_this_drug.difference_update(set(Core_Genes))
peripheral_GSEApy[drug_name] = list(genes_this_drug)
gsea_results = {}
for drug, gene_list in peripheral_GSEApy.items():
if not gene_list:
print(f"No genes for {drug}, skipping GO:BP enrichment")
continue
print(f"Running Enrichr GO:BP enrichment for {drug} ({len(gene_list)} genes)...")
try:
enr = gp.enrichr(
gene_list=gene_list,
# gene_sets="KEGG_2021_Human", # Use the latest GO:BP database
gene_sets=["KEGG_2021_Human", "Reactome_Pathways_2024", "WikiPathways_2024_Human", 'GO_Biological_Process_2025', 'BioPlanet_2019', 'GO_Molecular_Function_2025'],
# gene_sets= ["KEGG_2021_Human", "Reactome_Pathways_2024", "WikiPathways_2024_Human"],
organism="Human",
outdir=None, # donāt write to disk
cutoff=0.05
)
df_res = enr.results
df_res['drug'] = drug
gsea_results[drug] = df_res
print(f" Found {len(df_res)} enriched terms for {drug}")
except Exception as e:
print(f" Error processing {drug}: {e}")
all_gsea = [df for df in gsea_results.values() if not df.empty]
if all_gsea:
combined_gsea = pd.concat(all_gsea, ignore_index=True)
print(f"Total enriched terms across drugs: {combined_gsea['Term'].nunique()}")
else:
combined_gsea = pd.DataFrame()
print("No enrichment terms found.")
combined_gsea.rename(columns={
'Term': 'GO_term',
'Adjusted P-value': 'p_adj',
'Overlap': 'Overlap',
'Genes': 'Genes'
}, inplace=True)
InĀ [608]:
# Helper function: convert a set of Entrez IDs to unique, sorted gene symbols
def ids_to_symbols(id_set, id2symbol):
symbols = [id2symbol.get(int(i)) for i in id_set if int(i) in id2symbol]
symbols = [s.strip() for s in symbols if isinstance(s, str) and s.strip()]
return sorted(set(symbols))
# Convert each gene set
core_symbols = ids_to_symbols(Core_Genes, id2symbol)
multidrug_symbols = ids_to_symbols(multidrug_genes, id2symbol)
nonresp_symbols = ids_to_symbols(non_respondent_genes, id2symbol)
silenced_symbols = ids_to_symbols(silenced_genes, id2symbol)
# Put them into other_GSEApy
other_GSEApy = {
"Core": core_symbols,
"Multidrug": multidrug_symbols,
"Non-Respondent": nonresp_symbols,
"Silenced ORFs": silenced_symbols,
}
# Optional sanity check
for k, v in other_GSEApy.items():
print(f"{k}: {len(v)} mapped gene symbols")
Core: 197 mapped gene symbols Multidrug: 6556 mapped gene symbols Non-Respondent: 2849 mapped gene symbols Silenced ORFs: 1908 mapped gene symbols
InĀ [651]:
import gseapy as gp
# --- Step 1: Filter peripheral genes per drug (no exclusivity) ---
peripheral_GSEApy = {}
for drug_name in drug_name_list:
nan_col = f'nan_filter_{drug_name}'
fc_col = f'FC_{drug_name}'
pval_col = f'ovp3_{drug_name}'
df_sub = Final_pval[
(Final_pval[nan_col] == 1) &
((Final_pval[fc_col] >= 0.5)
|
(Final_pval[fc_col] <= -0.5)
)
&
(Final_pval[pval_col] <= 0.05)
]
genes_this_drug = set(df_sub['Gene_Symbol']) # make sure you use gene symbols
if 'Core_Genes' in globals():
genes_this_drug.difference_update(set(Core_Genes))
peripheral_GSEApy[drug_name] = list(genes_this_drug)
# peripheral_GSEApy = {}
# core = set(Core_Genes) if 'Core_Genes' in globals() else set()
# for drug_name in drug_name_list:
# ids = peripheral_dfs_select.get(drug_name, [])
# # Convert IDs to symbols, skipping unmapped ones
# symbols = [id2symbol.get(int(i)) for i in ids if int(i) in id2symbol]
# # Remove None values and strip whitespace
# symbols = [s.strip() for s in symbols if isinstance(s, str) and s.strip()]
# # Remove core genes if needed
# symbols = sorted(set(symbols) - core)
# peripheral_GSEApy[drug_name] = symbols
# print("Peripheral gene counts per drug:")
# for drug, genes in peripheral_GSEApy.items():
# print(f"{drug}: {len(genes)} genes")
# --- Step 2: Run GO:BP enrichment with GSEApy (Enrichr) ---
gsea_results = {}
for drug, gene_list in peripheral_GSEApy.items():
if not gene_list:
print(f"No genes for {drug}, skipping GO:BP enrichment")
continue
print(f"Running Enrichr GO:BP enrichment for {drug} ({len(gene_list)} genes)...")
try:
enr = gp.enrichr(
gene_list=gene_list,
# gene_sets="KEGG_2021_Human", # Use the latest GO:BP database
gene_sets=["KEGG_2021_Human", "Reactome_Pathways_2024", "WikiPathways_2024_Human", 'GO_Biological_Process_2025', 'BioPlanet_2019', 'GO_Molecular_Function_2025'],
# gene_sets= ["KEGG_2021_Human", "Reactome_Pathways_2024", "WikiPathways_2024_Human"],
organism="Human",
outdir=None, # donāt write to disk
cutoff=0.05
)
df_res = enr.results
df_res['drug'] = drug
gsea_results[drug] = df_res
print(f" Found {len(df_res)} enriched terms for {drug}")
except Exception as e:
print(f" Error processing {drug}: {e}")
# --- Step 3: Combine results ---
all_gsea = [df for df in gsea_results.values() if not df.empty]
if all_gsea:
combined_gsea = pd.concat(all_gsea, ignore_index=True)
print(f"Total enriched terms across drugs: {combined_gsea['Term'].nunique()}")
else:
combined_gsea = pd.DataFrame()
print("No enrichment terms found.")
# --- Optional: Rename columns for consistency ---
combined_gsea.rename(columns={
'Term': 'GO_term',
'Adjusted P-value': 'p_adj',
'Overlap': 'Overlap',
'Genes': 'Genes'
}, inplace=True)
Running Enrichr GO:BP enrichment for Paclitaxel (1701 genes)... Found 8573 enriched terms for Paclitaxel Running Enrichr GO:BP enrichment for Cisplatin (1907 genes)... Found 8692 enriched terms for Cisplatin Running Enrichr GO:BP enrichment for TFT (1751 genes)... Found 8651 enriched terms for TFT Running Enrichr GO:BP enrichment for FdU (1969 genes)... Found 8790 enriched terms for FdU Running Enrichr GO:BP enrichment for EdU (866 genes)... Found 6422 enriched terms for EdU Running Enrichr GO:BP enrichment for Doxorubicin (2099 genes)... Found 9185 enriched terms for Doxorubicin Running Enrichr GO:BP enrichment for 5FU (1900 genes)... Found 8820 enriched terms for 5FU Running Enrichr GO:BP enrichment for Carboplatin (1957 genes)... Found 8820 enriched terms for Carboplatin Running Enrichr GO:BP enrichment for Bleomycin (1877 genes)... Found 8779 enriched terms for Bleomycin Running Enrichr GO:BP enrichment for Etoposide (1594 genes)... Found 8178 enriched terms for Etoposide Running Enrichr GO:BP enrichment for MitomycinC (1541 genes)... Found 8324 enriched terms for MitomycinC Running Enrichr GO:BP enrichment for Carmustine (1751 genes)... Found 8535 enriched terms for Carmustine Running Enrichr GO:BP enrichment for Irinotecan (1952 genes)... Found 8614 enriched terms for Irinotecan Running Enrichr GO:BP enrichment for 6mercaptopurine (785 genes)... Found 6218 enriched terms for 6mercaptopurine Running Enrichr GO:BP enrichment for Vinblastine (1390 genes)... Found 7614 enriched terms for Vinblastine Running Enrichr GO:BP enrichment for TAS102 (2182 genes)... Found 9316 enriched terms for TAS102 Total enriched terms across drugs: 11055
InĀ [820]:
other_gsea_results = {}
# --- Step 2: Run GO:BP enrichment with GSEApy (Enrichr) ---
for category, gene_list in other_GSEApy.items():
if not gene_list:
print(f"No genes for {category}, skipping GO:BP enrichment")
continue
print(f"Running Enrichr GO:BP enrichment for {category} ({len(gene_list)} genes)...")
try:
enr = gp.enrichr(
gene_list=gene_list,
# you can include multiple libraries here
gene_sets=["KEGG_2021_Human", "Reactome_Pathways_2024", "WikiPathways_2024_Human", 'GO_Biological_Process_2025', 'GO_Molecular_Function_2025'],
# gene_sets="KEGG_2021_Human",
organism="Human",
outdir=None,
cutoff=0.05
)
df_res = enr.results
df_res['category'] = category
other_gsea_results[category] = df_res
print(f" Found {len(df_res)} enriched terms for {category}")
except Exception as e:
print(f" Error processing {category}: {e}")
# --- Step 3: Combine results ---
all_other_gsea = [df for df in other_gsea_results.values() if not df.empty]
if all_other_gsea:
combined_other_gsea = pd.concat(all_other_gsea, ignore_index=True)
print(f"Total enriched terms across categories: {combined_other_gsea['Term'].nunique()}")
else:
combined_other_gsea = pd.DataFrame()
print("No enrichment terms found for other_GSEApy.")
# --- Optional: Rename columns for consistency ---
combined_other_gsea.rename(columns={
'Term': 'GO_term',
'Adjusted P-value': 'p_adj',
'Overlap': 'Overlap',
'Genes': 'Genes'
}, inplace=True)
Running Enrichr GO:BP enrichment for Core (197 genes)... Found 1897 enriched terms for Core Running Enrichr GO:BP enrichment for Multidrug (6556 genes)... Found 9516 enriched terms for Multidrug Running Enrichr GO:BP enrichment for Non-Respondent (2849 genes)... Found 8053 enriched terms for Non-Respondent Running Enrichr GO:BP enrichment for Silenced ORFs (1908 genes)... Found 7590 enriched terms for Silenced ORFs Total enriched terms across categories: 9703
InĀ [881]:
categorical_gsea_results = {}
# --- Step 2: Run GO:BP enrichment with GSEApy (Enrichr) ---
for category, gene_list in categorical_GSEApy.items():
if not gene_list:
print(f"No genes for {category}, skipping GO:BP enrichment")
continue
print(f"Running Enrichr GO:BP enrichment for {category} ({len(gene_list)} genes)...")
try:
enr = gp.enrichr(
gene_list=gene_list,
gene_sets=[
"KEGG_2021_Human",
"Reactome_Pathways_2024",
"WikiPathways_2024_Human",
"GO_Biological_Process_2025",
"GO_Molecular_Function_2025"
],
organism="Human",
outdir=None,
cutoff=0.05
)
df_res = enr.results
df_res['category'] = category
categorical_gsea_results[category] = df_res
print(f" Found {len(df_res)} enriched terms for {category}")
except Exception as e:
print(f" Error processing {category}: {e}")
# --- Step 3: Combine results ---
all_categorical_gsea = [df for df in categorical_gsea_results.values() if not df.empty]
if all_categorical_gsea:
combined_categorical_gsea = pd.concat(all_categorical_gsea, ignore_index=True)
print(f"Total enriched terms across categories: {combined_categorical_gsea['Term'].nunique()}")
else:
combined_categorical_gsea = pd.DataFrame()
print("No enrichment terms found for categorical_GSEApy.")
# --- Optional: Rename columns for consistency ---
combined_categorical_gsea.rename(columns={
'Term': 'GO_term',
'Adjusted P-value': 'p_adj',
'Overlap': 'Overlap',
'Genes': 'Genes'
}, inplace=True)
Running Enrichr GO:BP enrichment for Antimetabolite (231 genes)... Found 2693 enriched terms for Antimetabolite Running Enrichr GO:BP enrichment for DNA cross linking agent (247 genes)... Found 2937 enriched terms for DNA cross linking agent Running Enrichr GO:BP enrichment for DNA strand break agent (342 genes)... Found 3406 enriched terms for DNA strand break agent Running Enrichr GO:BP enrichment for Microtubule inhibitor (252 genes)... Found 2551 enriched terms for Microtubule inhibitor Total enriched terms across categories: 6001
InĀ [1018]:
combined_categorical_gsea
Out[1018]:
| Gene_set | GO_term | Overlap | P-value | p_adj | Old P-value | Old Adjusted P-value | Odds Ratio | Combined Score | Genes | category | gene_count | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | KEGG_2021_Human | Phenylalanine metabolism | 3/17 | 0.000918 | 0.190844 | 0 | 0 | 18.566729 | 129.852632 | ALDH3B2;AOC2;HPD | Antimetabolite | 3 |
| 1 | KEGG_2021_Human | Tyrosine metabolism | 3/36 | 0.008201 | 0.498437 | 0 | 0 | 7.869219 | 37.800188 | ALDH3B2;AOC2;HPD | Antimetabolite | 3 |
| 2 | KEGG_2021_Human | Pathways of neurodegeneration | 12/475 | 0.009466 | 0.498437 | 0 | 0 | 2.284801 | 10.647343 | FIG4;ERN1;TUBB8;PSMD6;KLC3;ATP2A3;UQCRFS1;WNT9... | Antimetabolite | 12 |
| 3 | KEGG_2021_Human | Various types of N-glycan biosynthesis | 3/39 | 0.010235 | 0.498437 | 0 | 0 | 7.212354 | 33.046912 | MAN2A2;HEXB;STT3B | Antimetabolite | 3 |
| 4 | KEGG_2021_Human | Protein processing in endoplasmic reticulum | 6/171 | 0.014637 | 0.498437 | 0 | 0 | 3.168323 | 13.383670 | ERN1;SSR2;SSR1;STT3B;PDIA4;TXNDC5 | Antimetabolite | 6 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 11582 | GO_Molecular_Function_2025 | Tubulin Binding (GO:0015631) | 2/326 | 0.919016 | 0.934593 | 0 | 0 | 0.479605 | 0.040503 | GAS8;DLGAP5 | Microtubule inhibitor | 2 |
| 11583 | GO_Molecular_Function_2025 | Protein Heterodimerization Activity (GO:0046982) | 1/216 | 0.936314 | 0.948166 | 0 | 0 | 0.361957 | 0.023818 | PPP2CA | Microtubule inhibitor | 1 |
| 11584 | GO_Molecular_Function_2025 | Cadherin Binding (GO:0045296) | 1/317 | 0.982608 | 0.990865 | 0 | 0 | 0.244995 | 0.004298 | CAPZB | Microtubule inhibitor | 1 |
| 11585 | GO_Molecular_Function_2025 | Olfactory Receptor Activity (GO:0004984) | 1/379 | 0.992185 | 0.994195 | 0 | 0 | 0.204157 | 0.001602 | OR2J3 | Microtubule inhibitor | 1 |
| 11586 | GO_Molecular_Function_2025 | Anion Binding (GO:0043168) | 1/402 | 0.994195 | 0.994195 | 0 | 0 | 0.192219 | 0.001119 | GSR | Microtubule inhibitor | 1 |
11587 rows Ć 12 columns
InĀ [Ā ]:
# Count the number of genes in 'Genes' column by counting semicolons + 1
combined_categorical_gsea['gene_count'] = combined_categorical_gsea['Genes'].str.count(';') + 1
# Keep only rows with at least 3 genes
filtered_cetegorical_gsea = combined_categorical_gsea[combined_categorical_gsea['gene_count'] >= 3].copy()
# # Optionally drop the temporary 'gene_count' column if no longer needed
filtered_categorical_gsea = filtered_cetegorical_gsea[filtered_cetegorical_gsea['P-value'] <= 0.05].copy()
InĀ [890]:
filtered_categorical_gsea
Out[890]:
| Gene_set | GO_term | Overlap | P-value | p_adj | Old P-value | Old Adjusted P-value | Odds Ratio | Combined Score | Genes | category | gene_count | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | KEGG_2021_Human | Phenylalanine metabolism | 3/17 | 0.000918 | 0.190844 | 0 | 0 | 18.566729 | 129.852632 | ALDH3B2;AOC2;HPD | Antimetabolite | 3 |
| 1 | KEGG_2021_Human | Tyrosine metabolism | 3/36 | 0.008201 | 0.498437 | 0 | 0 | 7.869219 | 37.800188 | ALDH3B2;AOC2;HPD | Antimetabolite | 3 |
| 2 | KEGG_2021_Human | Pathways of neurodegeneration | 12/475 | 0.009466 | 0.498437 | 0 | 0 | 2.284801 | 10.647343 | FIG4;ERN1;TUBB8;PSMD6;KLC3;ATP2A3;UQCRFS1;WNT9... | Antimetabolite | 12 |
| 3 | KEGG_2021_Human | Various types of N-glycan biosynthesis | 3/39 | 0.010235 | 0.498437 | 0 | 0 | 7.212354 | 33.046912 | MAN2A2;HEXB;STT3B | Antimetabolite | 3 |
| 4 | KEGG_2021_Human | Protein processing in endoplasmic reticulum | 6/171 | 0.014637 | 0.498437 | 0 | 0 | 3.168323 | 13.383670 | ERN1;SSR2;SSR1;STT3B;PDIA4;TXNDC5 | Antimetabolite | 6 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 11355 | GO_Molecular_Function_2025 | Sequence-Specific Double-Stranded DNA Binding ... | 15/706 | 0.034477 | 0.507452 | 0 | 0 | 1.745498 | 5.877905 | OLIG3;CEBPE;RORC;DUX4;POU3F4;ELK4;SFPQ;IRF4;ZN... | Microtubule inhibitor | 15 |
| 11356 | GO_Molecular_Function_2025 | RNA Polymerase II Transcription Regulatory Reg... | 23/1236 | 0.039813 | 0.507452 | 0 | 0 | 1.534702 | 4.947199 | ZNF100;CEBPE;ZBTB16;RORC;ZNF3;ZNF25;DUX4;POU3F... | Microtubule inhibitor | 23 |
| 11357 | GO_Molecular_Function_2025 | Serine-Type Peptidase Activity (GO:0008236) | 5/149 | 0.040583 | 0.507452 | 0 | 0 | 2.755848 | 8.830882 | CLPP;HTRA4;PLAT;TTF2;PRSS12 | Microtubule inhibitor | 5 |
| 11358 | GO_Molecular_Function_2025 | RNA Polymerase II Cis-Regulatory Region Sequen... | 20/1054 | 0.045036 | 0.507452 | 0 | 0 | 1.560228 | 4.837152 | ZNF100;OLIG3;CEBPE;ZBTB16;RORC;DUX4;POU3F4;ELK... | Microtubule inhibitor | 20 |
| 11359 | GO_Molecular_Function_2025 | Guanyl-Nucleotide Exchange Factor Activity (GO... | 6/206 | 0.046696 | 0.507452 | 0 | 0 | 2.383902 | 7.304500 | PLEKHG3;PREX1;RABGEF1;ELMO1;ARHGEF4;FGD2 | Microtubule inhibitor | 6 |
333 rows Ć 12 columns
InĀ [887]:
# Count the number of genes in 'Genes' column by counting semicolons + 1
combined_other_gsea['gene_count'] = combined_other_gsea['Genes'].str.count(';') + 1
# Keep only rows with at least 3 genes
filtered_other_gsea = combined_other_gsea[combined_other_gsea['gene_count'] >= 3].copy()
# # Optionally drop the temporary 'gene_count' column if no longer needed
# filtered_gsea.drop(columns=['gene_count'], inplace=True)
filtered_other_gsea = filtered_other_gsea[filtered_other_gsea['P-value'] <= 0.05].copy()
InĀ [973]:
filtered_other_gsea
Out[973]:
| Gene_set | GO_term | Overlap | P-value | p_adj | Old P-value | Old Adjusted P-value | Odds Ratio | Combined Score | Genes | category | gene_count | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | KEGG_2021_Human | Human T-cell leukemia virus 1 infection | 9/219 | 0.000333 | 0.046568 | 0 | 0 | 4.466489 | 35.769786 | FOSL1;EGR1;MSX2;MYC;E2F1;TBPL1;SLC25A31;MSX1;C... | Core | 9 |
| 1 | KEGG_2021_Human | Maturity onset diabetes of the young | 3/26 | 0.002071 | 0.144949 | 0 | 0 | 13.298969 | 82.185848 | HNF4A;GCK;FOXA3 | Core | 3 |
| 2 | KEGG_2021_Human | Olfactory transduction | 11/440 | 0.004375 | 0.204187 | 0 | 0 | 2.670802 | 14.507133 | OR5M11;OR2C3;OR51E1;OR2AE1;OR2G3;OR6K2;OR1F1;O... | Core | 11 |
| 3 | KEGG_2021_Human | Chemical carcinogenesis | 7/239 | 0.009653 | 0.250869 | 0 | 0 | 3.107917 | 14.422219 | FGF8;KLF5;MYC;MGST3;E2F1;FGF20;CREB5 | Core | 7 |
| 4 | KEGG_2021_Human | Transcriptional misregulation in cancer | 6/192 | 0.012044 | 0.250869 | 0 | 0 | 3.313123 | 14.641432 | HOXA9;TLX3;PAX7;MYC;DDIT3;ETV4 | Core | 6 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 26308 | GO_Molecular_Function_2025 | Flavin Adenine Dinucleotide Binding (GO:0050660) | 10/58 | 0.046237 | 0.539071 | 0 | 0 | 1.980594 | 6.088315 | GCDH;AIFM1;MAOB;KDM1B;TXNRD1;ACOX3;KMO;SDHA;MT... | Silenced ORFs | 10 |
| 26309 | GO_Molecular_Function_2025 | Benzodiazepine Receptor Activity (GO:0008503) | 3/9 | 0.046962 | 0.539071 | 0 | 0 | 4.746982 | 14.518244 | GABRA6;GABRE;GABRG3 | Silenced ORFs | 3 |
| 26310 | GO_Molecular_Function_2025 | Tubulin-Glutamic Acid Ligase Activity (GO:0070... | 3/9 | 0.046962 | 0.539071 | 0 | 0 | 4.746982 | 14.518244 | TTLL6;TTLL10;TTLL2 | Silenced ORFs | 3 |
| 26311 | GO_Molecular_Function_2025 | Type II Transforming Growth Factor Beta Recept... | 3/9 | 0.046962 | 0.539071 | 0 | 0 | 4.746982 | 14.518244 | TGFB2;TGFB3;TGFBR1 | Silenced ORFs | 3 |
| 26312 | GO_Molecular_Function_2025 | Phospholipase Activator Activity (GO:0016004) | 4/15 | 0.047930 | 0.541468 | 0 | 0 | 3.453209 | 10.490882 | PDGFRB;BTK;FYN;ARHGAP6 | Silenced ORFs | 4 |
1536 rows Ć 12 columns
InĀ [892]:
# Combine row-wise
combined_filtered_gsea = pd.concat(
[filtered_categorical_gsea, filtered_other_gsea],
ignore_index=True
)
InĀ [894]:
import re
# Remove GO IDs in parentheses and trailing WP IDs
def clean_term(term):
term = re.sub(r"\s*\(GO:\d+\)$", "", term) # Remove GO IDs in parentheses
term = re.sub(r"\s+WP\d+$", "", term) # Remove trailing WP IDs
return term.strip()
# filtered_other_gsea["GO_term"] = filtered_other_gsea["GO_term"].apply(clean_term)
combined_filtered_gsea["GO_term"] = combined_filtered_gsea["GO_term"].apply(clean_term)
InĀ [1019]:
combined_filtered_gsea
Out[1019]:
| Gene_set | GO_term | Overlap | P-value | p_adj | Old P-value | Old Adjusted P-value | Odds Ratio | Combined Score | Genes | category | gene_count | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | KEGG_2021_Human | Phenylalanine metabolism | 3/17 | 0.000918 | 0.190844 | 0 | 0 | 18.566729 | 129.852632 | ALDH3B2;AOC2;HPD | Antimetabolite | 3 |
| 1 | KEGG_2021_Human | Tyrosine metabolism | 3/36 | 0.008201 | 0.498437 | 0 | 0 | 7.869219 | 37.800188 | ALDH3B2;AOC2;HPD | Antimetabolite | 3 |
| 2 | KEGG_2021_Human | Pathways of neurodegeneration | 12/475 | 0.009466 | 0.498437 | 0 | 0 | 2.284801 | 10.647343 | FIG4;ERN1;TUBB8;PSMD6;KLC3;ATP2A3;UQCRFS1;WNT9... | Antimetabolite | 12 |
| 3 | KEGG_2021_Human | Various types of N-glycan biosynthesis | 3/39 | 0.010235 | 0.498437 | 0 | 0 | 7.212354 | 33.046912 | MAN2A2;HEXB;STT3B | Antimetabolite | 3 |
| 4 | KEGG_2021_Human | Protein processing in endoplasmic reticulum | 6/171 | 0.014637 | 0.498437 | 0 | 0 | 3.168323 | 13.383670 | ERN1;SSR2;SSR1;STT3B;PDIA4;TXNDC5 | Antimetabolite | 6 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1864 | GO_Molecular_Function_2025 | Flavin Adenine Dinucleotide Binding | 10/58 | 0.046237 | 0.539071 | 0 | 0 | 1.980594 | 6.088315 | GCDH;AIFM1;MAOB;KDM1B;TXNRD1;ACOX3;KMO;SDHA;MT... | Silenced ORFs | 10 |
| 1865 | GO_Molecular_Function_2025 | Benzodiazepine Receptor Activity | 3/9 | 0.046962 | 0.539071 | 0 | 0 | 4.746982 | 14.518244 | GABRA6;GABRE;GABRG3 | Silenced ORFs | 3 |
| 1866 | GO_Molecular_Function_2025 | Tubulin-Glutamic Acid Ligase Activity | 3/9 | 0.046962 | 0.539071 | 0 | 0 | 4.746982 | 14.518244 | TTLL6;TTLL10;TTLL2 | Silenced ORFs | 3 |
| 1867 | GO_Molecular_Function_2025 | Type II Transforming Growth Factor Beta Recept... | 3/9 | 0.046962 | 0.539071 | 0 | 0 | 4.746982 | 14.518244 | TGFB2;TGFB3;TGFBR1 | Silenced ORFs | 3 |
| 1868 | GO_Molecular_Function_2025 | Phospholipase Activator Activity | 4/15 | 0.047930 | 0.541468 | 0 | 0 | 3.453209 | 10.490882 | PDGFRB;BTK;FYN;ARHGAP6 | Silenced ORFs | 4 |
1869 rows Ć 12 columns
InĀ [1006]:
# Define keywords of interest
keywords = ["Transcriptional misregulation in cancer",
"TGF-beta signaling pathway",
"SNARE interactions in vesicular transport",
"p53 signaling pathway", "Phagosome", "Ferroptosis",
"Biosynthesis of unsaturated fatty acids",
"Tight junction",
"AMPK signaling pathway", "PPAR signaling pathway",
# "RNA Polymerase II Transcription Termination",
"Regulation of DNA Damage Response, Signal Transduction by p53 Class Mediator",
"Positive Regulation of Cell Cycle",
"Wnt Signaling Pathway",
"Pyroptosis",
"PI3K AKT Signaling in Cancer",
"PI3K-Akt signaling pathway",
"Toll-like Receptor Signaling Pathway",
# "Regulation of ERK1 and ERK2 Cascade",
"Positive regulation of Autophagy",
# "Cytochrome P450 pathway",
# "MAPK signaling pathway",
"MAP Kinase Activation",
"CREB phosphorylation",
"Protein kinase Binding",
# "Regulation of Autophagy",
"Hippo signaling pathway",
"Activation of BH3-only proteins",
"Intrinsic Pathway for Apoptosis",
"Regulation of Double-Strand Break Repair",
"Positive Regulation of Cell Cycle",
"Calcium signaling Pathway",
"GnRH signaling pathway",
"Toll-like receptor signaling pathway",
"RAF activation",
"Purine Ribonulcleotide Biosynthetic Process",
"Regulated Necrosis"
]
# # Filter terms that contain any of the keywords (case insensitive)
# relevant_gsea = filtered_other_gsea[
# filtered_other_gsea["GO_term"].str.contains("|".join(keywords), case=False, na=False)
# ]
# relevant_gsea = filtered_other_gsea[
# filtered_other_gsea["GO_term"].isin(keywords)
# ]
relevant_gsea = combined_filtered_gsea[
combined_filtered_gsea["GO_term"].isin(keywords)
]
InĀ [1008]:
relevant_gsea
Out[1008]:
| Gene_set | GO_term | Overlap | P-value | p_adj | Old P-value | Old Adjusted P-value | Odds Ratio | Combined Score | Genes | category | gene_count | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 6 | KEGG_2021_Human | PI3K-Akt signaling pathway | 9/354 | 0.022288 | 0.498437 | 0 | 0 | 2.282491 | 8.681976 | COL2A1;EFNA2;LPAR6;PKN2;ITGA6;SOS2;YWHAZ;TLR2;... | Antimetabolite | 9 |
| 65 | KEGG_2021_Human | Hippo signaling pathway | 8/163 | 0.000971 | 0.095598 | 0 | 0 | 4.232258 | 29.361982 | LATS1;BMP2;DLG3;YWHAQ;YWHAB;TP53BP2;WNT9B;ACTG1 | DNA cross linking agent | 8 |
| 69 | KEGG_2021_Human | PI3K-Akt signaling pathway | 9/354 | 0.032351 | 0.653581 | 0 | 0 | 2.127293 | 7.299007 | MAP2K1;EFNA3;YWHAQ;YWHAB;ITGA2;F2R;PRKCA;ITGA5... | DNA cross linking agent | 9 |
| 100 | Reactome_Pathways_2024 | Intrinsic Pathway for Apoptosis | 3/55 | 0.030494 | 0.523092 | 0 | 0 | 4.658181 | 16.258082 | YWHAQ;YWHAB;TP53BP2 | DNA cross linking agent | 3 |
| 128 | GO_Biological_Process_2025 | Regulation of Double-Strand Break Repair | 6/91 | 0.000927 | 0.258265 | 0 | 0 | 5.760703 | 40.228089 | ING3;KDM1A;SPIRE2;DMAP1;FOXM1;BRD7 | DNA cross linking agent | 6 |
| 145 | GO_Biological_Process_2025 | Positive Regulation of Cell Cycle | 4/60 | 0.006445 | 0.378306 | 0 | 0 | 5.789830 | 29.206633 | FOXA1;TAL1;PRKCA;AURKA | DNA cross linking agent | 4 |
| 208 | KEGG_2021_Human | GnRH signaling pathway | 5/93 | 0.021734 | 0.668100 | 0 | 0 | 3.299501 | 12.633427 | CAMK2D;JMJD7-PLA2G4B;GNAQ;GNRH2;PRKACB | DNA strand break agent | 5 |
| 212 | KEGG_2021_Human | Toll-like receptor signaling pathway | 5/104 | 0.033121 | 0.668100 | 0 | 0 | 2.931241 | 9.988457 | CD40;IFNA7;CCL4;MAP3K8;FADD | DNA strand break agent | 5 |
| 273 | KEGG_2021_Human | TGF-beta signaling pathway | 5/94 | 0.006747 | 0.950090 | 0 | 0 | 4.471410 | 22.350815 | PPP2CA;E2F4;BMPR1B;RHOA;BMP5 | Microtubule inhibitor | 5 |
| 276 | KEGG_2021_Human | Transcriptional misregulation in cancer | 6/192 | 0.035084 | 0.950090 | 0 | 0 | 2.565172 | 8.593333 | ELK4;CEBPE;ZBTB16;TSPAN7;PLAT;KDM6A | Microtubule inhibitor | 6 |
| 337 | KEGG_2021_Human | Transcriptional misregulation in cancer | 6/192 | 0.012044 | 0.250869 | 0 | 0 | 3.313123 | 14.641432 | HOXA9;TLX3;PAX7;MYC;DDIT3;ETV4 | Core | 6 |
| 339 | KEGG_2021_Human | TGF-beta signaling pathway | 4/94 | 0.013992 | 0.250869 | 0 | 0 | 4.539551 | 19.380511 | TFDP1;MYC;PITX2;RGMA | Core | 4 |
| 479 | GO_Biological_Process_2025 | Positive Regulation of Cell Cycle | 3/60 | 0.021360 | 0.191283 | 0 | 0 | 5.357027 | 20.604474 | TCF7L1;MSX1;MEIS2 | Core | 3 |
| 489 | GO_Biological_Process_2025 | Wnt Signaling Pathway | 3/83 | 0.048751 | 0.291727 | 0 | 0 | 3.812436 | 11.517512 | TCF7L1;SNAI1;GATA3 | Core | 3 |
| 507 | KEGG_2021_Human | SNARE interactions in vesicular transport | 19/33 | 0.002895 | 0.307884 | 0 | 0 | 2.788195 | 16.295910 | STX17;GOSR2;GOSR1;SNAP23;STX18;STX7;STX1B;STX6... | Multidrug | 19 |
| 508 | KEGG_2021_Human | p53 signaling pathway | 35/73 | 0.004970 | 0.317120 | 0 | 0 | 1.893518 | 10.043929 | CDKN1A;CD82;EI24;PPM1D;RCHY1;BBC3;CCND3;CASP8;... | Multidrug | 35 |
| 514 | KEGG_2021_Human | TGF-beta signaling pathway | 41/94 | 0.017958 | 0.572849 | 0 | 0 | 1.590035 | 6.391524 | BMPR2;AMHR2;ACVR1B;LTBP1;PPP2CB;ACVR1C;PPP2R1A... | Multidrug | 41 |
| 523 | KEGG_2021_Human | Phagosome | 60/152 | 0.048190 | 0.737977 | 0 | 0 | 1.340490 | 4.065178 | ITGAM;TFRC;ITGB5;NCF2;ITGB2;TCIRG1;MPO;CTSS;FC... | Multidrug | 60 |
| 525 | KEGG_2021_Human | Ferroptosis | 19/41 | 0.048582 | 0.737977 | 0 | 0 | 1.773249 | 5.363212 | PRNP;TFRC;GPX4;ALOX15;SLC40A1;ACSL6;ACSL5;CYBB... | Multidrug | 19 |
| 539 | Reactome_Pathways_2024 | Intrinsic Pathway for Apoptosis | 27/55 | 0.008696 | 0.724412 | 0 | 0 | 1.981445 | 9.401659 | AVEN;DIABLO;APIP;UACA;BBC3;CASP8;AKT2;C1QBP;AK... | Multidrug | 27 |
| 548 | Reactome_Pathways_2024 | Pyroptosis | 15/27 | 0.012153 | 0.724412 | 0 | 0 | 2.566886 | 11.320440 | GSDMD;HMGB1;IL1B;IRF1;CASP4;CHMP2B;IRF2;CASP1;... | Multidrug | 15 |
| 569 | Reactome_Pathways_2024 | PI3K AKT Signaling in Cancer | 47/110 | 0.018145 | 0.724412 | 0 | 0 | 1.533669 | 6.149036 | CD86;CDKN1A;GSK3A;IRS1;SRC;CD80;MAPKAP1;PDGFB;... | Multidrug | 47 |
| 580 | Reactome_Pathways_2024 | Regulated Necrosis | 28/61 | 0.022122 | 0.724412 | 0 | 0 | 1.743111 | 6.643335 | HMGB1;SDCBP;CASP8;UBB;CASP4;CASP1;RIPK1;BAK1;R... | Multidrug | 28 |
| 888 | GO_Biological_Process_2025 | Wnt Signaling Pathway | 35/83 | 0.045723 | 0.999980 | 0 | 0 | 1.497917 | 4.621308 | LEF1;TCF7;DIXDC1;WNT8A;KLHL12;WNT6;PORCN;FRAT2... | Multidrug | 35 |
| 950 | KEGG_2021_Human | Biosynthesis of unsaturated fatty acids | 8/27 | 0.030155 | 0.999994 | 0 | 0 | 2.539061 | 8.890254 | ACOT7;ELOVL5;ELOVL3;SCD5;ACOT2;ACOT1;ELOVL6;ACOX3 | Non-Respondent | 8 |
| 951 | KEGG_2021_Human | Tight junction | 33/169 | 0.035270 | 0.999994 | 0 | 0 | 1.466136 | 4.903833 | IGSF5;ARPC5L;PRKAG2;PARD6G;CD1D;F11R;CD1B;AMOT... | Non-Respondent | 33 |
| 1176 | KEGG_2021_Human | AMPK signaling pathway | 24/120 | 0.000365 | 0.056282 | 0 | 0 | 2.388004 | 18.899482 | PFKFB1;CPT1A;PFKFB3;TSC2;PIK3R3;PRKAG2;PPP2R5A... | Silenced ORFs | 24 |
| 1220 | KEGG_2021_Human | PPAR signaling pathway | 12/74 | 0.046737 | 0.308869 | 0 | 0 | 1.840547 | 5.638008 | CPT1A;SCD;ACSL6;ILK;PPARG;ACADM;CD36;HMGCS2;AC... | Silenced ORFs | 12 |
| 1272 | Reactome_Pathways_2024 | MAP Kinase Activation | 13/63 | 0.005824 | 0.192571 | 0 | 0 | 2.475420 | 12.738038 | ATF1;ATF2;MAP2K1;MEF2C;FBXW11;RIPK2;RPS6KA3;MA... | Silenced ORFs | 13 |
InĀ [1009]:
# Define the desired order
category_order = [
"Core",
"Antimetabolite",
"DNA cross linking agent",
"DNA strand break agent",
"Microtubule inhibitor",
"Multidrug",
"Non-Respondent",
"Silenced ORFs"
]
# Convert 'category' to categorical with specified order
relevant_gsea['category'] = pd.Categorical(
relevant_gsea['category'],
categories=category_order,
ordered=True
)
# Sort by the ordered category
relevant_gsea = relevant_gsea.sort_values('category').reset_index(drop=True)
/tmp/ipykernel_1548459/2888785418.py:14: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
InĀ [1010]:
relevant_gsea
Out[1010]:
| Gene_set | GO_term | Overlap | P-value | p_adj | Old P-value | Old Adjusted P-value | Odds Ratio | Combined Score | Genes | category | gene_count | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | GO_Biological_Process_2025 | Wnt Signaling Pathway | 3/83 | 0.048751 | 0.291727 | 0 | 0 | 3.812436 | 11.517512 | TCF7L1;SNAI1;GATA3 | Core | 3 |
| 1 | GO_Biological_Process_2025 | Positive Regulation of Cell Cycle | 3/60 | 0.021360 | 0.191283 | 0 | 0 | 5.357027 | 20.604474 | TCF7L1;MSX1;MEIS2 | Core | 3 |
| 2 | KEGG_2021_Human | Transcriptional misregulation in cancer | 6/192 | 0.012044 | 0.250869 | 0 | 0 | 3.313123 | 14.641432 | HOXA9;TLX3;PAX7;MYC;DDIT3;ETV4 | Core | 6 |
| 3 | KEGG_2021_Human | TGF-beta signaling pathway | 4/94 | 0.013992 | 0.250869 | 0 | 0 | 4.539551 | 19.380511 | TFDP1;MYC;PITX2;RGMA | Core | 4 |
| 4 | KEGG_2021_Human | PI3K-Akt signaling pathway | 9/354 | 0.022288 | 0.498437 | 0 | 0 | 2.282491 | 8.681976 | COL2A1;EFNA2;LPAR6;PKN2;ITGA6;SOS2;YWHAZ;TLR2;... | Antimetabolite | 9 |
| 5 | KEGG_2021_Human | Hippo signaling pathway | 8/163 | 0.000971 | 0.095598 | 0 | 0 | 4.232258 | 29.361982 | LATS1;BMP2;DLG3;YWHAQ;YWHAB;TP53BP2;WNT9B;ACTG1 | DNA cross linking agent | 8 |
| 6 | KEGG_2021_Human | PI3K-Akt signaling pathway | 9/354 | 0.032351 | 0.653581 | 0 | 0 | 2.127293 | 7.299007 | MAP2K1;EFNA3;YWHAQ;YWHAB;ITGA2;F2R;PRKCA;ITGA5... | DNA cross linking agent | 9 |
| 7 | Reactome_Pathways_2024 | Intrinsic Pathway for Apoptosis | 3/55 | 0.030494 | 0.523092 | 0 | 0 | 4.658181 | 16.258082 | YWHAQ;YWHAB;TP53BP2 | DNA cross linking agent | 3 |
| 8 | GO_Biological_Process_2025 | Regulation of Double-Strand Break Repair | 6/91 | 0.000927 | 0.258265 | 0 | 0 | 5.760703 | 40.228089 | ING3;KDM1A;SPIRE2;DMAP1;FOXM1;BRD7 | DNA cross linking agent | 6 |
| 9 | GO_Biological_Process_2025 | Positive Regulation of Cell Cycle | 4/60 | 0.006445 | 0.378306 | 0 | 0 | 5.789830 | 29.206633 | FOXA1;TAL1;PRKCA;AURKA | DNA cross linking agent | 4 |
| 10 | KEGG_2021_Human | GnRH signaling pathway | 5/93 | 0.021734 | 0.668100 | 0 | 0 | 3.299501 | 12.633427 | CAMK2D;JMJD7-PLA2G4B;GNAQ;GNRH2;PRKACB | DNA strand break agent | 5 |
| 11 | KEGG_2021_Human | Toll-like receptor signaling pathway | 5/104 | 0.033121 | 0.668100 | 0 | 0 | 2.931241 | 9.988457 | CD40;IFNA7;CCL4;MAP3K8;FADD | DNA strand break agent | 5 |
| 12 | KEGG_2021_Human | Transcriptional misregulation in cancer | 6/192 | 0.035084 | 0.950090 | 0 | 0 | 2.565172 | 8.593333 | ELK4;CEBPE;ZBTB16;TSPAN7;PLAT;KDM6A | Microtubule inhibitor | 6 |
| 13 | KEGG_2021_Human | TGF-beta signaling pathway | 5/94 | 0.006747 | 0.950090 | 0 | 0 | 4.471410 | 22.350815 | PPP2CA;E2F4;BMPR1B;RHOA;BMP5 | Microtubule inhibitor | 5 |
| 14 | GO_Biological_Process_2025 | Wnt Signaling Pathway | 35/83 | 0.045723 | 0.999980 | 0 | 0 | 1.497917 | 4.621308 | LEF1;TCF7;DIXDC1;WNT8A;KLHL12;WNT6;PORCN;FRAT2... | Multidrug | 35 |
| 15 | Reactome_Pathways_2024 | Regulated Necrosis | 28/61 | 0.022122 | 0.724412 | 0 | 0 | 1.743111 | 6.643335 | HMGB1;SDCBP;CASP8;UBB;CASP4;CASP1;RIPK1;BAK1;R... | Multidrug | 28 |
| 16 | Reactome_Pathways_2024 | PI3K AKT Signaling in Cancer | 47/110 | 0.018145 | 0.724412 | 0 | 0 | 1.533669 | 6.149036 | CD86;CDKN1A;GSK3A;IRS1;SRC;CD80;MAPKAP1;PDGFB;... | Multidrug | 47 |
| 17 | Reactome_Pathways_2024 | Pyroptosis | 15/27 | 0.012153 | 0.724412 | 0 | 0 | 2.566886 | 11.320440 | GSDMD;HMGB1;IL1B;IRF1;CASP4;CHMP2B;IRF2;CASP1;... | Multidrug | 15 |
| 18 | Reactome_Pathways_2024 | Intrinsic Pathway for Apoptosis | 27/55 | 0.008696 | 0.724412 | 0 | 0 | 1.981445 | 9.401659 | AVEN;DIABLO;APIP;UACA;BBC3;CASP8;AKT2;C1QBP;AK... | Multidrug | 27 |
| 19 | KEGG_2021_Human | SNARE interactions in vesicular transport | 19/33 | 0.002895 | 0.307884 | 0 | 0 | 2.788195 | 16.295910 | STX17;GOSR2;GOSR1;SNAP23;STX18;STX7;STX1B;STX6... | Multidrug | 19 |
| 20 | KEGG_2021_Human | Phagosome | 60/152 | 0.048190 | 0.737977 | 0 | 0 | 1.340490 | 4.065178 | ITGAM;TFRC;ITGB5;NCF2;ITGB2;TCIRG1;MPO;CTSS;FC... | Multidrug | 60 |
| 21 | KEGG_2021_Human | TGF-beta signaling pathway | 41/94 | 0.017958 | 0.572849 | 0 | 0 | 1.590035 | 6.391524 | BMPR2;AMHR2;ACVR1B;LTBP1;PPP2CB;ACVR1C;PPP2R1A... | Multidrug | 41 |
| 22 | KEGG_2021_Human | p53 signaling pathway | 35/73 | 0.004970 | 0.317120 | 0 | 0 | 1.893518 | 10.043929 | CDKN1A;CD82;EI24;PPM1D;RCHY1;BBC3;CCND3;CASP8;... | Multidrug | 35 |
| 23 | KEGG_2021_Human | Ferroptosis | 19/41 | 0.048582 | 0.737977 | 0 | 0 | 1.773249 | 5.363212 | PRNP;TFRC;GPX4;ALOX15;SLC40A1;ACSL6;ACSL5;CYBB... | Multidrug | 19 |
| 24 | KEGG_2021_Human | Biosynthesis of unsaturated fatty acids | 8/27 | 0.030155 | 0.999994 | 0 | 0 | 2.539061 | 8.890254 | ACOT7;ELOVL5;ELOVL3;SCD5;ACOT2;ACOT1;ELOVL6;ACOX3 | Non-Respondent | 8 |
| 25 | KEGG_2021_Human | Tight junction | 33/169 | 0.035270 | 0.999994 | 0 | 0 | 1.466136 | 4.903833 | IGSF5;ARPC5L;PRKAG2;PARD6G;CD1D;F11R;CD1B;AMOT... | Non-Respondent | 33 |
| 26 | KEGG_2021_Human | PPAR signaling pathway | 12/74 | 0.046737 | 0.308869 | 0 | 0 | 1.840547 | 5.638008 | CPT1A;SCD;ACSL6;ILK;PPARG;ACADM;CD36;HMGCS2;AC... | Silenced ORFs | 12 |
| 27 | KEGG_2021_Human | AMPK signaling pathway | 24/120 | 0.000365 | 0.056282 | 0 | 0 | 2.388004 | 18.899482 | PFKFB1;CPT1A;PFKFB3;TSC2;PIK3R3;PRKAG2;PPP2R5A... | Silenced ORFs | 24 |
| 28 | Reactome_Pathways_2024 | MAP Kinase Activation | 13/63 | 0.005824 | 0.192571 | 0 | 0 | 2.475420 | 12.738038 | ATF1;ATF2;MAP2K1;MEF2C;FBXW11;RIPK2;RPS6KA3;MA... | Silenced ORFs | 13 |
InĀ [Ā ]:
from textwrap import wrap
import matplotlib.colors as mcolors
from mpl_toolkits.axes_grid1 import make_axes_locatable
# Ensure 'Combined Score' is float
relevant_gsea['Combined Score'] = pd.to_numeric(relevant_gsea['Combined Score'], errors='coerce')
relevant_gsea = relevant_gsea.dropna(subset=['Combined Score'])
relevant_gsea['Odds Ratio'] = relevant_gsea['Odds Ratio'].round().astype(int)
# Add log10 P-value
relevant_gsea['log10_P-value'] = -np.log10(relevant_gsea['P-value'])
# -----------------------------
# Wrap y-axis labels if too long
# -----------------------------
median_len = int(relevant_gsea['GO_term'].str.len().median())
median_len = 30
def wrap_label(label, max_len=median_len):
"""Split GO terms into 2 lines if longer than median length."""
if len(label) > max_len:
return "\n".join(wrap(label, max_len))
return label
relevant_gsea['GO_term_wrapped'] = relevant_gsea['GO_term'].apply(wrap_label)
# -----------------------------
# Count overlaps per GO term
# -----------------------------
# Add counts to labels (optional, can remove if not wanted)
go_counts = relevant_gsea.groupby('GO_term_wrapped')['category'].nunique().sort_values(ascending=True)
# Reorder categorical axis (descending by overlap)
relevant_gsea['GO_term_wrapped'] = pd.Categorical(
relevant_gsea['GO_term_wrapped'],
categories=go_counts.index,
ordered=False
)
# Plot
plt.figure(figsize=(25, 35
))
scatter = sns.scatterplot(
data=relevant_gsea,
x='category',
y='GO_term',
size='Odds Ratio',
hue='log10_P-value',
palette='RdPu',
sizes=(350, 2000),
edgecolor='black',
linewidth=1,
legend=False # We'll add custom legends
)
# Custom size legend (Odds Ratio)
size_values = np.linspace(relevant_gsea['Odds Ratio'].min(), relevant_gsea['Odds Ratio'].max(), 3)
markers = []
labels = []
for size in size_values:
markers.append(plt.scatter([], [],
s=(50 + (size - size_values.min()) /
(size_values.max() - size_values.min()) * (2000-350)),
color='gray', edgecolor='black'))
labels.append(f"{size:.2f}")
plt.legend(markers, labels, title='Odds Ratio', bbox_to_anchor=(1, 1), loc='upper left')
# Colorbar for -log10(P-value), smaller and moved
norm = mcolors.Normalize(vmin=relevant_gsea['log10_P-value'].min(),
vmax=relevant_gsea['log10_P-value'].max())
sm = plt.cm.ScalarMappable(cmap='RdPu', norm=norm)
sm.set_array([])
# Use fraction to control size and pad to move it
cbar = plt.colorbar(sm, ax=scatter, fraction=0.04, pad=0.05)
cbar.set_label('-log10(P-value)', rotation=270, labelpad=40)
# Aesthetics
plt.xticks(rotation=45, ha='right')
plt.tick_params(axis='both', which='major', width=1, length=10)
plt.ylabel('')
plt.xlabel('')
plt.grid(False)
plt.tight_layout()
scatter.set_ylim(len(relevant_gsea['GO_term_wrapped'].unique()) - 0.5, -0.5)
plt.savefig("/home/harryjo/rnaseq_analysis/RQ023682/RQ023682_graphs/Statistical_Graph/RQ023682_Dotplot_all.svg", format='svg', dpi=2000)
plt.show()
InĀ [961]:
# Count the number of genes in 'Genes' column by counting semicolons + 1
combined_gsea['gene_count'] = combined_gsea['Genes'].str.count(';') + 1
# Keep only rows with at least 3 genes
filtered_gsea = combined_gsea[combined_gsea['gene_count'] >= 3].copy()
# # Optionally drop the temporary 'gene_count' column if no longer needed
# filtered_gsea.drop(columns=['gene_count'], inplace=True)
filtered_gsea = filtered_gsea[filtered_gsea['P-value'] <= 0.05].copy()
InĀ [962]:
import re
# Remove GO IDs in parentheses and trailing WP IDs
def clean_term(term):
term = re.sub(r"\s*\(GO:\d+\)$", "", term) # Remove GO IDs in parentheses
term = re.sub(r"\s+WP\d+$", "", term) # Remove trailing WP IDs
return term.strip()
filtered_gsea["GO_term"] = filtered_gsea["GO_term"].apply(clean_term)
InĀ [963]:
filtered_gsea
Out[963]:
| Gene_set | GO_term | Overlap | P-value | p_adj | Old P-value | Old Adjusted P-value | Odds Ratio | Combined Score | Genes | drug | gene_count | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | KEGG_2021_Human | TGF-beta signaling pathway | 19/94 | 0.000308 | 0.095476 | 0 | 0 | 2.744796 | 22.192908 | TGIF1;TGFB2;INHBB;INHBA;ACVR2B;BMP7;SMAD5;DCN;... | Paclitaxel | 19 |
| 1 | KEGG_2021_Human | RNA degradation | 14/79 | 0.006392 | 0.711796 | 0 | 0 | 2.327992 | 11.762601 | LSM1;LSM5;TOB1;PATL1;HSPD1;LSM2;EDC4;CNOT4;LSM... | Paclitaxel | 14 |
| 2 | KEGG_2021_Human | PI3K-Akt signaling pathway | 44/354 | 0.007054 | 0.711796 | 0 | 0 | 1.540904 | 7.633904 | PHLPP1;TNXB;CHRM1;CSF1;LAMC3;IFNA2;PDGFA;PIK3C... | Paclitaxel | 44 |
| 3 | KEGG_2021_Human | Hematopoietic cell lineage | 16/99 | 0.009184 | 0.711796 | 0 | 0 | 2.083987 | 9.774403 | CSF1;GP1BB;ITGA2;FLT3LG;CD3G;DNTT;EPOR;KITLG;C... | Paclitaxel | 16 |
| 4 | KEGG_2021_Human | ECM-receptor interaction | 14/88 | 0.016214 | 0.907660 | 0 | 0 | 2.043849 | 8.424479 | TNXB;LAMC3;GP1BB;ITGA2;DMP1;LAMC2;THBS2;IBSP;C... | Paclitaxel | 14 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 132680 | GO_Molecular_Function_2025 | Secondary Active Transmembrane Transporter Act... | 11/58 | 0.046917 | 0.809614 | 0 | 0 | 1.915785 | 5.861105 | SLC7A5;SLC35A1;CLCN6;SLC22A6;SLC46A2;SLC35D1;S... | TAS102 | 11 |
| 132681 | GO_Molecular_Function_2025 | Activin Receptor Activity | 3/8 | 0.047677 | 0.809614 | 0 | 0 | 4.904911 | 14.927115 | ACVRL1;ACVR1C;TGFBR2 | TAS102 | 3 |
| 132682 | GO_Molecular_Function_2025 | Double-Stranded Telomeric DNA Binding | 3/8 | 0.047677 | 0.809614 | 0 | 0 | 4.904911 | 14.927115 | XRCC6;TERF1;TERF2 | TAS102 | 3 |
| 132683 | GO_Molecular_Function_2025 | Tumor Necrosis Factor Receptor Activity | 3/8 | 0.047677 | 0.809614 | 0 | 0 | 4.904911 | 14.927115 | TNFRSF14;TNFRSF25;TNFRSF4 | TAS102 | 3 |
| 132684 | GO_Molecular_Function_2025 | Postsynaptic Neurotransmitter Receptor Activity | 5/19 | 0.048587 | 0.809788 | 0 | 0 | 2.920795 | 8.833634 | CHRNB2;CHRNA2;CHRNB3;CHRNA4;CHRNA6 | TAS102 | 5 |
6122 rows Ć 12 columns
InĀ [964]:
drug_to_category = {}
for cat, drugs in drug_category.items():
for d in drugs:
drug_to_category[d] = cat
InĀ [965]:
filtered_gsea['category'] = filtered_gsea['drug'].map(drug_to_category)
InĀ [966]:
go_by_cat = (
filtered_gsea.groupby('GO_term')
.agg({
'drug': lambda x: set(x),
'category': lambda x: set(x),
'Gene_set': lambda x: set(x)
})
.reset_index()
)
# Count number of drugs and categories per GO term
go_by_cat['num_drugs'] = go_by_cat['drug'].apply(len)
go_by_cat['num_categories'] = go_by_cat['category'].apply(len)
# --- Keep only overlaps WITHIN a single category ---
within_cat_go = go_by_cat[
(go_by_cat['num_drugs'] > 1) & (go_by_cat['num_categories'] >= 1)
].copy()
# Sort so biggest overlaps appear first
within_cat_go = within_cat_go.sort_values(['num_drugs'], ascending=False)
InĀ [967]:
within_cat_go
Out[967]:
| GO_term | drug | category | Gene_set | num_drugs | num_categories | |
|---|---|---|---|---|---|---|
| 3036 | Sequence-Specific DNA Binding | {FdU, Cisplatin, Carboplatin, TAS102, Doxorubi... | {Antimetabolite, Microtubule inhibitor, DNA cr... | {GO_Molecular_Function_2025} | 13 | 4 |
| 3037 | Sequence-Specific Double-Stranded DNA Binding | {FdU, Cisplatin, Carboplatin, TAS102, Doxorubi... | {Antimetabolite, Microtubule inhibitor, DNA cr... | {GO_Molecular_Function_2025} | 13 | 4 |
| 670 | Double-Stranded DNA Binding | {FdU, Cisplatin, Carboplatin, TAS102, Doxorubi... | {Antimetabolite, Microtubule inhibitor, DNA cr... | {GO_Molecular_Function_2025} | 13 | 4 |
| 3523 | bHLH Transcription Factor Binding | {FdU, Carboplatin, TAS102, Carmustine, Irinote... | {Antimetabolite, Microtubule inhibitor, DNA cr... | {GO_Molecular_Function_2025} | 12 | 4 |
| 589 | DNA-binding Transcription Activator Activity, ... | {FdU, Cisplatin, Carboplatin, TAS102, Carmusti... | {Antimetabolite, Microtubule inhibitor, DNA cr... | {GO_Molecular_Function_2025} | 11 | 4 |
| ... | ... | ... | ... | ... | ... | ... |
| 54 | Acetylcholine binding and downstream events | {Etoposide, TAS102} | {Antimetabolite, DNA strand break agent} | {BioPlanet_2019} | 2 | 2 |
| 55 | Acetylcholine-Gated Monoatomic Cation-Selectiv... | {Etoposide, TAS102} | {Antimetabolite, DNA strand break agent} | {GO_Molecular_Function_2025} | 2 | 2 |
| 31 | AKT Phosphorylates Targets in the Cytosol | {Bleomycin, TAS102} | {Antimetabolite, DNA strand break agent} | {Reactome_Pathways_2024} | 2 | 2 |
| 59 | Acidic Amino Acid Transmembrane Transporter Ac... | {5FU, Vinblastine} | {Antimetabolite, Microtubule inhibitor} | {GO_Molecular_Function_2025} | 2 | 2 |
| 65 | Activated Point Mutants of FGFR2 | {Etoposide, Vinblastine} | {Microtubule inhibitor, DNA strand break agent} | {Reactome_Pathways_2024} | 2 | 2 |
1327 rows Ć 6 columns
InĀ [968]:
# Define words/phrases you want to exclude
exclude_keywords = [
"development", "Heart", "Neuron", "Morphogenesis", "Muscle", "T cell", "T-helper", "synaptic",
"neurogenesis", "Beta", "SARS", "Olfactory", "Gatrulation", "carcinoma", "tuberculosis", "breast", "gastric", "stem", "leukemia",
"endometrial", "pancreatic", "circadian rhythm", "melanoma", "colorectal", "osteoclast", "bladder", "melanogenesis",
"cushing", "bladder", "virus", "neomycin", "RNA", "butanoate", "leishmaniasis", "glioma", "immunodeficiency", "vibrio", "cysteine", "Histidine",
"fatty", "Xenobiotic", "biosynthesis", "sugar", "cytokine", "Gastrulation", "Nephrogenesis"
]
# Build regex pattern for case-insensitive search
pattern = "|".join(exclude_keywords)
# Keep only GO terms that do NOT contain those words
within_cat_go_filtered = within_cat_go[
~within_cat_go['GO_term'].str.contains(pattern, case=False, na=False)
].copy()
InĀ [994]:
# Define keywords of interest
keywords = ["Hippo signaling pathway",
"Activation of BH3-only Proteins",
"TGF-beta signaling pathway",
"Cellular senescence",
"Wnt signaling pathway",
"SNARE interactions in vesicular transport",
# "Central carbon metabolism in cancer", "Drug metabolism", "Proteoglycans in cancer", "inflmmatory mediator regulation of TRP channels",
# "PI3K-Akt signaling pathway",
"Ferroptosis",
"NF-kappa B signaling pathway",
"Loss of Function of SMAD2 3 in cancer",
"Intrinsic Pathway for Apoptosis", "RAF Activation",
"TP53 Regulates Transcription of Genes Involved in G1 Cell Cycle Arrest",
"Pyroptosis",
"Positive Regulation of Cell Population Proliferation",
"Phagolysosome Assembly",
"Autophagosome Maturation",
"Positive Regulation of Autophagy",
"Phagolysosome Assembly",
"Stathmin and breast cancer resistance to antimicrotubule agents",
"PI3K AKT Signaling in Cancer",
"Myc active pathway",
"Regulation of KIT Signaling",
"Negative regulation of Telomere Maintenance",
# "Chk1 Chk2(Cds1) Mediated Inactivation of Cyclin B Cdk1 Complex",
"Regulated Necrosis",
"Tumor Necrosis Factor Receptor Activity",
"Pyrimidine metabolism",
]
# peripheral_gsea = filtered_gsea[
# filtered_gsea["GO_term"].str.contains("|".join(keywords), case=False, na=False)
# ]
peripheral_gsea = filtered_gsea[
filtered_gsea["GO_term"].isin(keywords)
]
InĀ [981]:
peripheral_gsea["drug"].unique()
Out[981]:
['Paclitaxel', 'Cisplatin', 'TFT', 'FdU', 'EdU', ..., 'Carmustine', 'Irinotecan', '6mercaptopurine', 'Vinblastine', 'TAS102'] Length: 16 Categories (16, object): ['TFT' < 'TAS102' < 'FdU' < 'EdU' ... 'Irinotecan' < 'Bleomycin' < 'Paclitaxel' < 'Vinblastine']
InĀ [976]:
import matplotlib.colors as mcolors
from mpl_toolkits.axes_grid1 import make_axes_locatable
# Ensure 'Combined Score' is float
peripheral_gsea['Combined Score'] = pd.to_numeric(peripheral_gsea['Combined Score'], errors='coerce')
peripheral_gsea = peripheral_gsea.dropna(subset=['Combined Score'])
peripheral_gsea['Odds Ratio'] = peripheral_gsea['Odds Ratio'].round().astype(int)
# Add log10 P-value
peripheral_gsea['log10_P-value'] = -np.log10(peripheral_gsea['P-value'])
peripheral_gsea['drug'] = pd.Categorical(
peripheral_gsea['drug'],
categories=ordered_drug_names,
ordered=True
)
# Plot
plt.figure(figsize=(30, 15))
scatter = sns.scatterplot(
data=peripheral_gsea,
x='drug',
y='GO_term',
size='Odds Ratio',
hue='log10_P-value',
palette='cool',
sizes=(150, 1000),
edgecolor='black',
linewidth=1,
legend=False # We'll add custom legends
)
# Custom size legend (Odds Ratio)
size_values = np.linspace(peripheral_gsea['Odds Ratio'].min(), peripheral_gsea['Odds Ratio'].max(), 3)
markers = []
labels = []
for size in size_values:
markers.append(plt.scatter([], [],
s=(50 + (size - size_values.min()) /
(size_values.max() - size_values.min()) * (1000-150)),
color='gray', edgecolor='black'))
labels.append(f"{size:.2f}")
plt.legend(markers, labels, title='Odds Ratio', bbox_to_anchor=(1, 1), loc='upper left')
# Colorbar for -log10(P-value), smaller and moved
norm = mcolors.Normalize(vmin=peripheral_gsea['log10_P-value'].min(),
vmax=peripheral_gsea['log10_P-value'].max())
sm = plt.cm.ScalarMappable(cmap='cool', norm=norm)
sm.set_array([])
# Use fraction to control size and pad to move it
cbar = plt.colorbar(sm, ax=scatter, fraction=0.03, pad=0.05)
cbar.set_label('-log10(P-value)', rotation=270, labelpad=40)
# Aesthetics
plt.xticks(rotation=45, ha='right')
plt.tick_params(axis='both', which='major', width=1, length=10)
plt.ylabel('')
plt.xlabel('')
plt.grid(False)
plt.tight_layout()
plt.show()
/tmp/ipykernel_1548459/3074881863.py:5: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
InĀ [1017]:
from textwrap import wrap
# Ensure 'Combined Score' is float
peripheral_gsea['Combined Score'] = pd.to_numeric(peripheral_gsea['Combined Score'], errors='coerce')
peripheral_gsea = peripheral_gsea.dropna(subset=['Combined Score'])
peripheral_gsea['Odds Ratio'] = peripheral_gsea['Odds Ratio'].round().astype(int)
# Add log10 P-value
peripheral_gsea['log10_P-value'] = -np.log10(peripheral_gsea['P-value'])
# Order drug categories if provided
peripheral_gsea['drug'] = pd.Categorical(
peripheral_gsea['drug'],
categories=ordered_drug_names,
ordered=True
)
# -----------------------------
# Wrap y-axis labels if too long
# -----------------------------
median_len = int(peripheral_gsea['GO_term'].str.len().median())
median_len = 35
def wrap_label(label, max_len=median_len):
"""Split GO terms into 2 lines if longer than median length."""
if len(label) > max_len:
return "\n".join(wrap(label, max_len))
return label
peripheral_gsea['GO_term_wrapped'] = peripheral_gsea['GO_term'].apply(wrap_label)
# -----------------------------
# Count overlaps per GO term
# -----------------------------
# Add counts to labels (optional, can remove if not wanted)
go_counts = peripheral_gsea.groupby('GO_term_wrapped')['drug'].nunique().sort_values(ascending=False)
# Reorder categorical axis (descending by overlap)
peripheral_gsea['GO_term_wrapped'] = pd.Categorical(
peripheral_gsea['GO_term_wrapped'],
categories=go_counts.index,
ordered=True
)
# -----------------------------
# Plot
# -----------------------------
plt.figure(figsize=(40, 35))
scatter = sns.scatterplot(
data=peripheral_gsea,
x='drug',
y='GO_term_wrapped',
size='Odds Ratio',
hue='log10_P-value',
palette='RdPu',
sizes=(350, 2000),
edgecolor='black',
linewidth=1,
legend=False # We'll add custom legends
)
# Custom size legend (Odds Ratio)
size_values = np.linspace(peripheral_gsea['Odds Ratio'].min(), peripheral_gsea['Odds Ratio'].max(), 3)
markers = []
labels = []
for size in size_values:
markers.append(plt.scatter([], [],
s=(50 + (size - size_values.min()) /
(size_values.max() - size_values.min()) * (2000-350)),
color='gray', edgecolor='black'))
labels.append(f"{size:.2f}")
plt.legend(markers, labels, title='Odds Ratio', bbox_to_anchor=(1, 1), loc='upper left')
# Colorbar for -log10(P-value)
norm = mcolors.Normalize(vmin=peripheral_gsea['log10_P-value'].min(),
vmax=peripheral_gsea['log10_P-value'].max())
sm = plt.cm.ScalarMappable(cmap='RdPu', norm=norm)
sm.set_array([])
cbar = plt.colorbar(sm, ax=scatter, fraction=0.04, pad=0.05)
cbar.set_label('-log10(P-value)', rotation=270, labelpad=30)
cbar.locator = plt.MaxNLocator(nbins=9) # automatically choose ticks
cbar.update_ticks()
# Aesthetics
plt.xticks(rotation=45, ha='right')
plt.tick_params(axis='both', which='major', width=1, length=10)
plt.ylabel('')
plt.xlabel('')
plt.grid(False)
plt.tight_layout()
scatter.set_ylim(len(peripheral_gsea['GO_term_wrapped'].unique()) - 0.5, -0.5)
plt.savefig("/home/harryjo/rnaseq_analysis/RQ023682/RQ023682_graphs/Statistical_Graph/RQ023682_Dotplot_peripheral.svg", format='svg', dpi=2000)
plt.show()
InĀ [524]:
filtered_gsea
Out[524]:
| Gene_set | GO_term | Overlap | P-value | p_adj | Old P-value | Old Adjusted P-value | Odds Ratio | Combined Score | Genes | drug | gene_count | category | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | KEGG_2021_Human | TGF-beta signaling pathway | 19/94 | 0.000308 | 0.095476 | 0 | 0 | 2.744796 | 22.192908 | TGIF1;TGFB2;INHBB;INHBA;ACVR2B;BMP7;SMAD5;DCN;... | Paclitaxel | 19 | Microtubule inhibitor |
| 1 | KEGG_2021_Human | RNA degradation | 14/79 | 0.006392 | 0.711796 | 0 | 0 | 2.327992 | 11.762601 | LSM1;LSM5;TOB1;PATL1;HSPD1;LSM2;EDC4;CNOT4;LSM... | Paclitaxel | 14 | Microtubule inhibitor |
| 2 | KEGG_2021_Human | PI3K-Akt signaling pathway | 44/354 | 0.007054 | 0.711796 | 0 | 0 | 1.540904 | 7.633904 | PHLPP1;TNXB;CHRM1;CSF1;LAMC3;IFNA2;PDGFA;PIK3C... | Paclitaxel | 44 | Microtubule inhibitor |
| 3 | KEGG_2021_Human | Hematopoietic cell lineage | 16/99 | 0.009184 | 0.711796 | 0 | 0 | 2.083987 | 9.774403 | CSF1;GP1BB;ITGA2;FLT3LG;CD3G;DNTT;EPOR;KITLG;C... | Paclitaxel | 16 | Microtubule inhibitor |
| 4 | KEGG_2021_Human | ECM-receptor interaction | 14/88 | 0.016214 | 0.907660 | 0 | 0 | 2.043849 | 8.424479 | TNXB;LAMC3;GP1BB;ITGA2;DMP1;LAMC2;THBS2;IBSP;C... | Paclitaxel | 14 | Microtubule inhibitor |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 4613 | KEGG_2021_Human | Tuberculosis | 28/180 | 0.034162 | 0.295125 | 0 | 0 | 1.510800 | 5.101411 | CEBPB;ITGAM;SRC;TCIRG1;HSPD1;MRC2;HLA-DMA;AKT2... | TAS102 | 28 | Antimetabolite |
| 4614 | KEGG_2021_Human | Fatty acid degradation | 9/43 | 0.039672 | 0.333462 | 0 | 0 | 2.166373 | 6.991107 | HADHB;ADH4;ACAA2;ECHS1;ACSL6;ACADM;HADH;ACAT2;... | TAS102 | 9 | Antimetabolite |
| 4615 | KEGG_2021_Human | Adherens junction | 13/71 | 0.042007 | 0.342269 | 0 | 0 | 1.835265 | 5.817630 | FARP2;TCF7L2;TCF7L1;SMAD3;YES1;SRC;NLK;RHOA;TG... | TAS102 | 13 | Antimetabolite |
| 4616 | KEGG_2021_Human | Thyroid cancer | 8/37 | 0.042921 | 0.342269 | 0 | 0 | 2.257272 | 7.106776 | TCF7L2;CDKN1A;NRAS;TCF7L1;GADD45B;BAX;POLK;MAPK3 | TAS102 | 8 | Antimetabolite |
| 4617 | KEGG_2021_Human | Circadian rhythm | 7/31 | 0.045901 | 0.356877 | 0 | 0 | 2.386169 | 7.352447 | PER1;FBXW11;RORB;BTRC;PRKAB1;NPAS2;ARNTL | TAS102 | 7 | Antimetabolite |
253 rows Ć 13 columns
InĀ [865]:
peripheral_gsea
Out[865]:
| Gene_set | GO_term | Overlap | P-value | p_adj | Old P-value | Old Adjusted P-value | Odds Ratio | Combined Score | Genes | drug | gene_count | category | log10_P-value | GO_term_wrapped | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | KEGG_2021_Human | TGF-beta signaling pathway | 19/94 | 0.000308 | 0.095476 | 0 | 0 | 3 | 22.192908 | TGIF1;TGFB2;INHBB;INHBA;ACVR2B;BMP7;SMAD5;DCN;... | Paclitaxel | 19 | Microtubule inhibitor | 3.511466 | TGF-beta signaling pathway |
| 2 | KEGG_2021_Human | PI3K-Akt signaling pathway | 44/354 | 0.007054 | 0.711796 | 0 | 0 | 2 | 7.633904 | PHLPP1;TNXB;CHRM1;CSF1;LAMC3;IFNA2;PDGFA;PIK3C... | Paclitaxel | 44 | Microtubule inhibitor | 2.151570 | PI3K-Akt signaling pathway |
| 323 | Reactome_Pathways_2024 | RAF Activation | 8/34 | 0.006587 | 0.797496 | 0 | 0 | 3 | 16.680278 | PPP2CA;CAMK2D;PPP2R1A;CAMK2A;ARAF;PPP2R5A;RAF1... | Paclitaxel | 8 | Microtubule inhibitor | 2.181313 | RAF Activation |
| 384 | Reactome_Pathways_2024 | Intrinsic Pathway for Apoptosis | 9/55 | 0.041190 | 0.900740 | 0 | 0 | 2 | 6.732095 | PPP1R13B;APAF1;DIABLO;STAT3;E2F1;TP53BP2;NMT1;... | Paclitaxel | 9 | Microtubule inhibitor | 1.385211 | Intrinsic Pathway for Apoptosis |
| 390 | Reactome_Pathways_2024 | PI3K AKT Signaling in Cancer | 15/110 | 0.045803 | 0.918209 | 0 | 0 | 2 | 5.256649 | EGF;CD80;MAPKAP1;FLT3LG;FOXO6;PDGFA;PIK3CD;FGF... | Paclitaxel | 15 | Microtubule inhibitor | 1.339110 | PI3K AKT Signaling in Cancer |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 124533 | Reactome_Pathways_2024 | Activation of BH3-only Proteins | 10/30 | 0.000898 | 0.204766 | 0 | 0 | 4 | 28.742425 | YWHAE;TFDP1;BAD;AKT2;AKT3;BCL2;PMAIP1;YWHAZ;BB... | TAS102 | 10 | Antimetabolite | 3.046676 | Activation of BH3-only Proteins |
| 124539 | Reactome_Pathways_2024 | Intrinsic Pathway for Apoptosis | 14/55 | 0.001878 | 0.244639 | 0 | 0 | 3 | 17.576975 | YWHAE;BAD;STAT3;GZMB;YWHAZ;BBC3;TFDP1;AKT2;AKT... | TAS102 | 14 | Antimetabolite | 2.726372 | Intrinsic Pathway for Apoptosis |
| 127094 | GO_Biological_Process_2025 | Positive Regulation of Cell Population Prolife... | 76/484 | 0.000705 | 0.156055 | 0 | 0 | 2 | 11.175628 | CNTFR;CSF2;KDM1A;BNC1;HTR2B;CIB1;TCIRG1;FGF1;A... | TAS102 | 76 | Antimetabolite | 3.151827 | Positive Regulation of Cell Population\nProlif... |
| 131310 | BioPlanet_2019 | Myc active pathway | 19/85 | 0.001739 | 0.196014 | 0 | 0 | 2 | 15.013030 | EIF4A1;SMAD3;CDCA7;HMGA1;HSPD1;PRDX3;FOSL1;KAT... | TAS102 | 19 | Antimetabolite | 2.759639 | Myc active pathway |
| 131440 | BioPlanet_2019 | Wnt signaling pathway | 34/231 | 0.043339 | 0.409109 | 0 | 0 | 1 | 4.443843 | NLK;PRKCZ;SOX2;GJA1;CCND3;SOX17;AKT2;HNF4A;AKT... | TAS102 | 34 | Antimetabolite | 1.363120 | Wnt signaling pathway |
83 rows Ć 15 columns
InĀ [278]:
# Group GO terms by drug
go_to_drugs = (
filtered_gsea.groupby('GO_term')['drug']
.apply(set) # collect drugs per GO term
.reset_index()
)
# Keep only GO terms associated with >1 drug (common terms)
common_go_terms = go_to_drugs[go_to_drugs['drug'].apply(lambda x: len(x) > 1)].copy()
print(f"Found {len(common_go_terms)} GO terms common to multiple drugs")
# If you want to see how many drugs each term is shared across
common_go_terms['num_drugs'] = common_go_terms['drug'].apply(len)
# Sort by number of drugs sharing the GO term
common_go_terms = common_go_terms.sort_values('num_drugs', ascending=False)
Found 464 GO terms common to multiple drugs
InĀ [186]:
# List of irrelevant keywords to filter out (add/remove as needed)
irrelevant_keywords = [
# "Glial", "development", "mediated", "Transport",
"neuronal", "Translation", "neural", "adipose", 'olfactory', "cancer"
]
# add more if you want to exclude
# Ensure 'Combined Score' is float type
filtered_gsea['Combined Score'] = pd.to_numeric(filtered_gsea['Combined Score'], errors='coerce')
# Drop rows with NaNs in Combined Score just in case
filtered_gsea = filtered_gsea.dropna(subset=['Combined Score'])
# Filter out rows containing any irrelevant keywords in 'GO_term' (case insensitive)
pattern = '|'.join(irrelevant_keywords)
mask = ~filtered_gsea['GO_term'].str.lower().str.contains(pattern)
filtered_gsea = filtered_gsea[mask]
# Get top 2 combined score rows per drug after filtering
top2_by_drug = (
filtered_gsea
.sort_values(by=['drug', 'Combined Score'], ascending=[True, False])
.groupby('drug')
.head(1)
.reset_index(drop=True)
)
top2_by_drug_sorted = top2_by_drug.copy()
top2_by_drug_sorted['log10_P-value'] = -np.log10(top2_by_drug_sorted['P-value'])
# Plot
plt.figure(figsize=(30, 30))
sns.scatterplot(
data=top2_by_drug_sorted,
x='drug',
y='GO_term',
size='Odds Ratio',
hue='log10_P-value',
palette='coolwarm_r',
sizes=(50, 400),
edgecolor='black',
legend='brief'
)
# Aesthetics
plt.xticks(rotation=45, ha='right')
plt.xlabel('Drug')
plt.ylabel('Top GO:BP Terms')
plt.tight_layout()
plt.grid(True, linestyle='--', alpha=0.3)
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()
InĀ [256]:
moa_keywords = {
# Microtubule inhibitors
'Paclitaxel': ['microtubule stabilization', 'mitotic arrest', 'spindle assembly', 'tubulin polymerization', 'chromosome segregation', 'microtubule dynamics', 'microtubule'],
'Vinblastine': ['microtubule depolymerization', 'spindle disruption', 'tubulin binding', 'mitotic arrest', 'cell cycle', 'chromosome segregation', 'microtubule assembly'],
# DNA cross-linking agents (alkylating/platinum-based)
'Cisplatin': ['platinum-based', 'dna crosslink', 'alkylation', 'dna repair', 'adduct formation', 'replication fork stalling', 'apoptosis','autophagy'],
'Carboplatin': ['platinum-based', 'dna crosslink', 'alkylation', 'dna repair', 'adduct formation', 'replication stress', 'apoptosis', 'autophagy'],
'Carmustine': ['nitrosourea alkylation', 'dna crosslink', 'alkylation', 'dna damage', 'carbamoylation', 'dna repair inhibition', 'mutagenesis', 'autophagy'],
'MitomycinC': ['dna crosslink', 'bioreductive activation', 'dna synthesis inhibition', 'hypoxia activation', 'crosslink repair', 'dna damage', 'autophagy'],
# DNA strand break / topoisomerase inhibitors
'Doxorubicin': ['topoisomerase inhibition', 'dna intercalation', 'free radical formation', 'oxidative stress', 'apoptosis', 'dna damage', 'autophagy'],
'Etoposide': ['topoisomerase inhibition', 'dna double strand breaks', 'cell cycle arrest', 'checkpoint activation', 'apoptosis', 'dna damage', 'replication stress', 'autophagy'],
'Irinotecan': ['topoisomerase inhibition', 'cell cycle arrest', 'dna damage', 'checkpoint activation', 'replication stress', 'apoptosis', 'autophagy'],
'Bleomycin': ['dna strand cleavage', 'oxidative stress', 'free radical generation', 'dna damage', 'apoptosis', 'cell cycle arrest', 'autophagy'],
# Antimetabolites
'TFT': ['WNT', 'cell cycle', 'thymidine', 'nucleotide', 'dna synthesis inhibition', 'folate', 'autophagy'],
'TAS102': ['nucleoside metabolic inhibition', 'thymidine analog', 'dna synthesis inhibition', 'thymidine kinase', 'replication stress', 'cell cycle arrest', 'lyso'],
'FdU': ['dna chain termination', 'fluorodeoxyuridine', 'thymidylate synthase inhibition', 'dna synthesis inhibition', 'cell cycle arrest', 'nucleotide analog','autophagy'],
'EdU': ['dna chain termination', 'thymidine analog', 'dna synthesis', 's-phase progression', 'cell proliferation', 'dna','autophagy'],
'5FU': ['thymidylate synthase inhibition', 'fluorouracil', 'rna metabolism disruption', 'dna damage', 'cell cycle arrest', 'nucleotide metabolism','autophagy'],
# '6mercaptopurine': ['DNA', 'purine metabolism inhibition', 'rna synthesis inhibition', 'dna synthesis inhibition', 'thiopurine metabolism', 'immunosuppression', 'cell proliferation inhibition'],
'6mercaptopurine': ['DNA', 'synthesis', 'purine', 'dna synthesis', 'thiopurine', 'immunosuppression', 'cell proliferation','autophagy'],
}
InĀ [262]:
import pandas as pd
filtered_moa_results = []
for drug, keywords in moa_keywords.items():
df_drug = combined_gsea[combined_gsea['drug'] == drug].copy()
df_drug['GO_term_lower'] = df_drug['GO_term'].str.lower()
pattern = '|'.join(keywords)
mask = df_drug['GO_term_lower'].str.contains(pattern)
filtered_drug = df_drug[mask]
filtered_moa_results.append(filtered_drug)
filtered_moa_gsea = pd.concat(filtered_moa_results, ignore_index=True)
filtered_moa_gsea = filtered_moa_gsea[filtered_moa_gsea['gene_count'] >= 2].copy()
InĀ [224]:
# import pandas as pd
# filtered_moa_results = []
# for drug, keywords in moa_keywords.items():
# df_drug = filtered_gsea[filtered_gsea['drug'] == drug].copy()
# df_drug['GO_term_lower'] = df_drug['GO_term'].str.lower()
# pattern = '|'.join(keywords)
# mask = df_drug['GO_term_lower'].str.contains(pattern)
# filtered_drug = df_drug[mask]
# filtered_moa_results.append(filtered_drug)
# filtered_moa_gsea = pd.concat(filtered_moa_results, ignore_index=True)
# filtered_moa_gsea.drop(columns=['GO_term_lower'], inplace=True)
InĀ [263]:
filtered_moa_gsea
Out[263]:
| Gene_set | GO_term | Overlap | P-value | p_adj | Old P-value | Old Adjusted P-value | Odds Ratio | Combined Score | Genes | drug | gene_count | GO_term_lower | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | GO_Biological_Process_2025 | Vesicle Transport Along Microtubule (GO:0047496) | 3/24 | 0.084750 | 0.806303 | 0 | 0 | 3.141049 | 7.752249 | KIF3A;KIF1C;TRAK1 | Paclitaxel | 3 | vesicle transport along microtubule (go:0047496) |
| 2 | GO_Biological_Process_2025 | Protein Localization to Microtubule Organizing... | 2/19 | 0.199940 | 0.806303 | 0 | 0 | 2.584314 | 4.160067 | DCTN2;CSNK1D | Paclitaxel | 2 | protein localization to microtubule organizing... |
| 3 | GO_Biological_Process_2025 | Transport Along Microtubule (GO:0010970) | 2/20 | 0.216135 | 0.806303 | 0 | 0 | 2.440613 | 3.738653 | BAG3;COPG1 | Paclitaxel | 2 | transport along microtubule (go:0010970) |
| 6 | GO_Biological_Process_2025 | Regulation of Mitotic Spindle Assembly (GO:190... | 2/22 | 0.248816 | 0.806303 | 0 | 0 | 2.196322 | 3.055173 | PDCD6IP;VPS4B | Paclitaxel | 2 | regulation of mitotic spindle assembly (go:190... |
| 8 | GO_Biological_Process_2025 | Regulation of Spindle Assembly (GO:0090169) | 2/31 | 0.394154 | 0.806303 | 0 | 0 | 1.513991 | 1.409545 | PDCD6IP;VPS4B | Paclitaxel | 2 | regulation of spindle assembly (go:0090169) |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 483 | GO_Biological_Process_2025 | Positive Regulation of Smooth Muscle Cell Prol... | 2/49 | 0.343011 | 0.673254 | 0 | 0 | 1.679513 | 1.797067 | TGFB1;HTR1B | 6mercaptopurine | 2 | positive regulation of smooth muscle cell prol... |
| 484 | GO_Biological_Process_2025 | Regulation of Autophagy (GO:0010506) | 7/230 | 0.343498 | 0.673254 | 0 | 0 | 1.240296 | 1.325347 | SETD2;ENDOG;BAD;LEP;PIP4K2A;SVIP;SIRT2 | 6mercaptopurine | 7 | regulation of autophagy (go:0010506) |
| 487 | GO_Biological_Process_2025 | Regulation of Endothelial Cell Proliferation (... | 3/87 | 0.365308 | 0.675820 | 0 | 0 | 1.409771 | 1.419660 | SEMA5A;LEP;WNT5A | 6mercaptopurine | 3 | regulation of endothelial cell proliferation (... |
| 491 | GO_Biological_Process_2025 | Negative Regulation of Autophagy (GO:0010507) | 2/65 | 0.480656 | 0.713921 | 0 | 0 | 1.251940 | 0.917175 | LEP;SIRT2 | 6mercaptopurine | 2 | negative regulation of autophagy (go:0010507) |
| 497 | GO_Biological_Process_2025 | Macroautophagy (GO:0016236) | 3/162 | 0.768131 | 0.842545 | 0 | 0 | 0.741908 | 0.195711 | GABARAPL2;STX17;PIP4K2A | 6mercaptopurine | 3 | macroautophagy (go:0016236) |
280 rows Ć 13 columns
InĀ [259]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import matplotlib.cm as cm
import matplotlib.colors as mcolors
# List of irrelevant keywords to filter out (add/remove as needed)
irrelevant_keywords = [
"cancer", 'export', 'melanoma', 'kanamycin', 'stem', 'Thymus', 'viral', 'cholesterol'
# add more if you want to exclude
]
# Ensure 'Combined Score' is float type
filtered_moa_gsea['Combined Score'] = pd.to_numeric(filtered_moa_gsea['Combined Score'], errors='coerce')
# Drop rows with NaNs in Combined Score just in case
filtered_gsea = filtered_moa_gsea.dropna(subset=['Combined Score'])
# Filter out rows containing any irrelevant keywords in 'GO_term' (case insensitive)
pattern = '|'.join(irrelevant_keywords)
mask = ~filtered_gsea['GO_term'].str.lower().str.contains(pattern)
filtered_gsea = filtered_gsea[mask]
# Get top 2 combined score rows per drug after filtering
top2_by_drug = (
filtered_gsea
.sort_values(by=['drug', 'Combined Score'], ascending=[True, False])
.groupby('drug')
.head(2)
.reset_index(drop=True)
)
top2_by_drug_sorted = top2_by_drug.copy()
top2_by_drug_sorted['log10_P-value'] = -np.log10(top2_by_drug_sorted['P-value'])
# š¹ Ensure drug column follows custom order
top2_by_drug_sorted['drug'] = pd.Categorical(
top2_by_drug_sorted['drug'],
categories=ordered_drug_names,
ordered=True
)
# Sort by that categorical order so plotting respects it
top2_by_drug_sorted = top2_by_drug_sorted.sort_values('drug')
# Create the plot
plt.figure(figsize=(40, 15))
# Plot with legend=False to prevent seaborn from adding any legend automatically
scatter = sns.scatterplot(
data=top2_by_drug_sorted,
x='drug',
y='GO_term',
size='Odds Ratio',
hue='log10_P-value',
palette='coolwarm_r',
sizes=(50, 400),
edgecolor='black',
legend=False # Disable seaborn's automatic legend
)
# Add size legend manually
# Create some legend handles for Odds Ratio sizes
import matplotlib.patches as mpatches
from matplotlib.legend_handler import HandlerTuple
# Get some example sizes for the legend
size_values = np.linspace(top2_by_drug_sorted['Odds Ratio'].min(), top2_by_drug_sorted['Odds Ratio'].max(), 3)
markers = []
labels = []
for size in size_values:
markers.append(plt.scatter([], [], s=(50 + (size - size_values.min()) / (size_values.max() - size_values.min()) * (400-50)),
color='gray', edgecolor='black'))
labels.append(f"{size:.2f}")
# Create the legend for size
plt.legend(markers, labels, title='Odds Ratio', bbox_to_anchor=(0.75, 1), loc='upper left')
# Add colorbar for -log10(P-value)
norm = mcolors.Normalize(vmin=top2_by_drug_sorted['log10_P-value'].min(),
vmax=top2_by_drug_sorted['log10_P-value'].max())
sm = plt.cm.ScalarMappable(cmap='coolwarm', norm=norm)
sm.set_array([])
cbar = plt.colorbar(sm, ax=scatter)
cbar.set_label('-log10(P-value)', rotation=270, labelpad=50)
plt.xticks(rotation=45, ha='right')
plt.xlabel('Drug')
plt.ylabel('Top Two REACTOME Term')
# plt.grid(True, linestyle='--', alpha=0.3)
plt.grid(False)
plt.tight_layout()
plt.show()
InĀ [179]:
CCDC86_df = pd.read_excel("/home/harryjo/rnaseq_analysis/RQ023682/RQ023682_db/20250805_Figure4_Survival.xlsx", sheet_name ='CCDC86')
GHSR_df = pd.read_excel("/home/harryjo/rnaseq_analysis/RQ023682/RQ023682_db/20250805_Figure4_Survival.xlsx", sheet_name ='GHSR')
InĀ [180]:
data = CCDC86_df.iloc[3:].copy()
# Convert to numeric
data['Survival Rate'] = pd.to_numeric(data['Unnamed: 4'], errors='coerce')
data['Time (months)'] = pd.to_numeric(data['Unnamed: 5'], errors='coerce')
data['Type'] = data['Unnamed: 6']
# Convert months to years
data['Time (years)'] = data['Time (months)'] / 12
# Count samples per group
group_counts = data['Type'].value_counts().to_dict()
colors = {'High': '#007A03', 'Low': '#FFA90E'}
Cutoff = -1.6
plt.figure(figsize=(8, 8))
for group in ['High', 'Low']:
group_data = data[data['Type'] == group].sort_values('Time (years)')
plt.step(group_data['Time (years)'], group_data['Survival Rate'], where='post',
color=colors[group], linewidth=7, label=f"{group} (n={group_counts.get(group,0)})")
legend_handles = [Line2D([0], [0], color=colors[g], lw=3, label=f"{g} (n={group_counts.get(g,0)})") for g in ['High', 'Low']]
# legend_handles.append(Line2D([0], [0], color='none', label=f"Cutoff = {Cutoff}"))
legend_handles.append(Line2D([0], [0], color='none', label=r"$P < 10^{-10}$"))
plt.legend(handles=legend_handles, loc='upper right', frameon=False, fontsize=14)
plt.title("CCDC86", fontsize=20)
plt.xlabel("Time (years)", fontsize=16)
plt.ylabel("Survival Probability", fontsize=16)
plt.ylim(0, 1.05)
plt.xlim(0, data['Time (years)'].max())
plt.grid(False)
plt.tight_layout()
plt.savefig("/home/harryjo/rnaseq_analysis/RQ023682/RQ023682_graphs/Statistical_Graph/RQ023682_CCDC86.svg", format='svg', dpi=1000)
plt.show()
InĀ [181]:
data = GHSR_df.iloc[3:].copy()
# Convert to numeric
data['Survival Rate'] = pd.to_numeric(data['Unnamed: 4'], errors='coerce')
data['Time (months)'] = pd.to_numeric(data['Unnamed: 5'], errors='coerce')
data['Type'] = data['Unnamed: 6']
# Convert months to years
data['Time (years)'] = data['Time (months)'] / 12
# Count samples per group
group_counts = data['Type'].value_counts().to_dict()
colors = {'High': '#007A03', 'Low': '#FFA90E'}
Cutoff = -4.28
plt.figure(figsize=(8, 8))
for group in ['High', 'Low']:
group_data = data[data['Type'] == group].sort_values('Time (years)')
plt.step(group_data['Time (years)'], group_data['Survival Rate'], where='post',
color=colors[group], linewidth=7, label=f"{group} (n={group_counts.get(group,0)})")
legend_handles = [Line2D([0], [0], color=colors[g], lw=3, label=f"{g} (n={group_counts.get(g,0)})") for g in ['High', 'Low']]
# legend_handles.append(Line2D([0], [0], color='none', label=f"Cutoff = {Cutoff}"))
legend_handles.append(Line2D([0], [0], color='none', label=r"$P < 10^{-10}$"))
plt.legend(handles=legend_handles, loc='upper right', frameon=False, fontsize=14)
plt.title("GHSR", fontsize=20)
plt.xlabel("Time (years)", fontsize=16)
plt.ylabel("Survival Probability", fontsize=16)
plt.ylim(0, 1.05)
plt.xlim(0, data['Time (years)'].max())
plt.grid(False)
plt.tight_layout()
plt.savefig("/home/harryjo/rnaseq_analysis/RQ023682/RQ023682_graphs/Statistical_Graph/RQ023682_GHSR.svg", format='svg', dpi=1000)
plt.show()
InĀ [182]:
# def plot_gene_counts_barchart(df: pd.DataFrame, drug_name_list: list):
# """
# Generates a single grouped bar chart showing the number of up-regulated
# and down-regulated genes for each drug.
# Args:
# df (pd.DataFrame): The DataFrame containing the drug data.
# drug_name_list (list): A list of drug names to iterate through.
# """
# up_counts = []
# down_counts = []
# for drug_name in drug_name_list:
# nan_col = f'nan_filter_{drug_name}'
# fc_col = f'FC_{drug_name}'
# pval_col = f'ovp3_{drug_name}'
# # Skip if columns are missing for a drug
# if not all(col in df.columns for col in [nan_col, fc_col, pval_col]):
# print(f"Warning: Skipping {drug_name} as one or more required columns are missing.")
# up_counts.append(0)
# down_counts.append(0)
# continue
# # Filter for up-regulated genes (FC >= 0.5)
# up_regulated = df[
# (df[nan_col] == 1) &
# (df[fc_col] >= 0.5) &
# (df[pval_col] <= 0.05)
# ]
# up_counts.append(len(up_regulated))
# # Filter for down-regulated genes (FC <= -0.5)
# down_regulated = df[
# (df[nan_col] == 1) &
# (df[fc_col] <= -0.5) &
# (df[pval_col] <= 0.05)
# ]
# down_counts.append(len(down_regulated))
# # Set up the plot
# bar_width = 0.35
# index = np.arange(len(drug_name_list))
# fig, ax = plt.subplots(figsize=(16, 16))
# bar1 = ax.bar(index, up_counts, bar_width, label='Up-regulated (>0.5)', color='skyblue')
# bar2 = ax.bar(index + bar_width, down_counts, bar_width, label='Down-regulated (<-0.5)', color='salmon')
# # Add text labels on top of the bars
# def add_labels(bars):
# for bar in bars:
# height = bar.get_height()
# if height > 0:
# ax.text(bar.get_x() + bar.get_width() / 2., height,
# '%d' % int(height),
# ha='center', va='bottom')
# add_labels(bar1)
# add_labels(bar2)
# # Customize the plot
# ax.set_xlabel('Drug Name', fontsize=12)
# ax.set_ylabel('Number of Genes', fontsize=12)
# ax.set_title('Number of Up-regulated and Down-regulated Genes per Drug', fontsize=16)
# ax.set_xticks(index + bar_width / 2)
# ax.set_xticklabels(drug_name_list, rotation=90, ha='center')
# ax.legend()
# ax.grid(axis='y', linestyle='--', alpha=0.6)
# plt.tight_layout()
# plt.show()